parser.rb in regexp-examples-1.1.4

- old
+ new

@@ -1,9 +1,26 @@
+require_relative 'parser_helpers/parse_group_helper'
+require_relative 'parser_helpers/parse_after_backslash_group_helper'
+require_relative 'parser_helpers/parse_multi_group_helper'
+require_relative 'parser_helpers/parse_repeater_helper'
+require_relative 'parser_helpers/charset_negation_helper'
+
+# :nodoc:
 module RegexpExamples
   IllegalSyntaxError = Class.new(StandardError)
+  # A Regexp parser, used to build a structured collection of objects that represents
+  # the regular expression.
+  # This object can then be used to generate strings that match the regular expression.
   class Parser
+    include ParseGroupHelper
+    include ParseAfterBackslashGroupHelper
+    include ParseMultiGroupHelper
+    include ParseRepeaterHelper
+    include CharsetNegationHelper
+
     attr_reader :regexp_string
+
     def initialize(regexp_string, regexp_options)
       @regexp_string = regexp_string
       @ignorecase = !(regexp_options & Regexp::IGNORECASE).zero?
       @multiline = !(regexp_options & Regexp::MULTILINE).zero?
       @extended = !(regexp_options & Regexp::EXTENDED).zero?
@@ -25,313 +42,42 @@
     private
 
     def parse_group(repeaters)
       case next_char
       when '('
-        group = parse_multi_group
+        parse_multi_group
       when '['
-        group = parse_char_group
+        parse_char_group
       when '.'
-        group = parse_dot_group
+        parse_dot_group
       when '|'
-        group = parse_or_group(repeaters)
+        parse_or_group(repeaters)
       when '\\'
-        group = parse_after_backslash_group
+        parse_after_backslash_group
       when '^'
-        group = parse_caret
+        parse_caret
       when '$'
-        group = parse_dollar
+        parse_dollar
       when /[#\s]/
-        group = parse_extended_whitespace
+        parse_extended_whitespace
       else
-        group = parse_single_char_group(next_char)
+        parse_single_char_group(next_char)
       end
-      group
     end
 
     def parse_repeater(group)
       case next_char
       when '*'
-        repeater = parse_star_repeater(group)
+        parse_star_repeater(group)
       when '+'
-        repeater = parse_plus_repeater(group)
+        parse_plus_repeater(group)
       when '?'
-        repeater = parse_question_mark_repeater(group)
+        parse_question_mark_repeater(group)
       when '{'
-        repeater = parse_range_repeater(group)
+        parse_range_repeater(group)
       else
-        repeater = parse_one_time_repeater(group)
+        parse_one_time_repeater(group)
       end
-      repeater
-    end
-
-    def parse_caret
-      if @current_position == 0
-        return PlaceHolderGroup.new # Ignore the "illegal" character
-      else
-        raise_anchors_exception!
-      end
-    end
-
-    def parse_dollar
-      if @current_position == (regexp_string.length - 1)
-        return PlaceHolderGroup.new # Ignore the "illegal" character
-      else
-        raise_anchors_exception!
-      end
-    end
-
-    def parse_extended_whitespace
-      if @extended
-        skip_whitespace
-        group = PlaceHolderGroup.new # Ignore the whitespace/comment
-      else
-        group = parse_single_char_group(next_char)
-      end
-      group
-    end
-
-    def skip_whitespace
-      whitespace_chars = rest_of_string.match(/#.*|\s+/)[0]
-      @current_position += whitespace_chars.length - 1
-    end
-
-    def parse_after_backslash_group
-      @current_position += 1
-      case
-      when rest_of_string =~ /\A(\d{1,3})/
-        @current_position += (Regexp.last_match(1).length - 1) # In case of 10+ backrefs!
-        group = parse_backreference_group(Regexp.last_match(1))
-      when rest_of_string =~ /\Ak['<]([\w-]+)['>]/ # Named capture group
-        @current_position += (Regexp.last_match(1).length + 2)
-        group_id = if Regexp.last_match(1).to_i < 0
-                     # RELATIVE group number, e.g. /(a)(b)(c)(d) \k<-2>/
-                     @num_groups + Regexp.last_match(1).to_i + 1
-                   else
-                     Regexp.last_match(1)
-                   end
-        group = parse_backreference_group(group_id)
-      when BackslashCharMap.keys.include?(next_char)
-        group = CharGroup.new(
-          BackslashCharMap[next_char].dup,
-          @ignorecase
-        )
-      when rest_of_string =~ /\A(c|C-)(.)/ # Control character
-        @current_position += Regexp.last_match(1).length
-        group = parse_single_char_group(parse_control_character(Regexp.last_match(2)))
-      when rest_of_string =~ /\Ax(\h{1,2})/ # Escape sequence
-        @current_position += Regexp.last_match(1).length
-        group = parse_single_char_group(parse_unicode_sequence(Regexp.last_match(1)))
-      when rest_of_string =~ /\Au(\h{4}|\{\h{1,4}\})/ # Unicode sequence
-        @current_position += Regexp.last_match(1).length
-        sequence = Regexp.last_match(1).match(/\h{1,4}/)[0] # Strip off "{" and "}"
-        group = parse_single_char_group(parse_unicode_sequence(sequence))
-      when rest_of_string =~ /\A(p)\{(\^?)([^}]+)\}/i # Named properties
-        @current_position += (Regexp.last_match(2).length + # 0 or 1, of '^' is present
-                              Regexp.last_match(3).length + # Length of the property name
-                              2) # Length of opening and closing brackets (always 2)
-        # Beware of double negatives! E.g. /\P{^Space}/
-        is_negative = (Regexp.last_match(1) == 'P') ^ (Regexp.last_match(2) == '^')
-        group = CharGroup.new(
-          if is_negative
-            CharSets::Any.dup - NamedPropertyCharMap[Regexp.last_match(3).downcase]
-          else
-            NamedPropertyCharMap[Regexp.last_match(3).downcase]
-          end,
-          @ignorecase
-        )
-      when next_char == 'K' # Keep (special lookbehind that CAN be supported safely!)
-        group = PlaceHolderGroup.new
-      when next_char == 'R' # Linebreak
-        group = CharGroup.new(
-          ["\r\n", "\n", "\v", "\f", "\r"],
-          @ignorecase
-        ) # Using "\r\n" as one character is little bit hacky...
-      when next_char == 'g' # Subexpression call
-        fail IllegalSyntaxError,
-             'Subexpression calls (\\g) cannot be supported, as they are not regular'
-      when next_char =~ /[bB]/ # Anchors
-        raise_anchors_exception!
-      when next_char =~ /[AG]/ # Start of string
-        if @current_position == 1
-          group = PlaceHolderGroup.new
-        else
-          raise_anchors_exception!
-        end
-      when next_char =~ /[zZ]/ # End of string
-        if @current_position == (regexp_string.length - 1)
-          # TODO: /\Z/ should be treated as /\n?/
-          group = PlaceHolderGroup.new
-        else
-          raise_anchors_exception!
-        end
-      else
-        group = parse_single_char_group(next_char)
-      end
-      group
-    end
-
-    def parse_multi_group
-      @current_position += 1
-      @num_groups += 1
-      remember_old_regexp_options do
-        group_id = nil # init
-        rest_of_string.match(
-          /
-          \A
-          (\?)?               # Is it a "special" group, i.e. starts with a "?"?
-            (
-              :               # Non capture group
-              |!              # Neglookahead
-              |=              # Lookahead
-              |\#             # Comment group
-              |<              # Lookbehind or named capture
-              (
-                !             # Neglookbehind
-                |=            # Lookbehind
-                |[^>]+        # Named capture
-              )
-              |[mix]*-?[mix]* # Option toggle
-            )?
-          /x
-        ) do |match|
-          case
-          when match[1].nil? # e.g. /(normal)/
-            group_id = @num_groups.to_s
-          when match[2] == ':' # e.g. /(?:nocapture)/
-            @current_position += 2
-          when match[2] == '#' # e.g. /(?#comment)/
-            comment_group = rest_of_string.match(/.*?[^\\](?:\\{2})*\)/)[0]
-            @current_position += comment_group.length
-          when match[2] =~ /\A(?=[mix-]+)([mix]*)-?([mix]*)/ # e.g. /(?i-mx)/
-            regexp_options_toggle(Regexp.last_match(1), Regexp.last_match(2))
-            @num_groups -= 1 # Toggle "groups" should not increase backref group count
-            @current_position += $&.length + 1
-            if next_char == ':' # e.g. /(?i:subexpr)/
-              @current_position += 1
-            else
-              return PlaceHolderGroup.new
-            end
-          when %w(! =).include?(match[2]) # e.g. /(?=lookahead)/, /(?!neglookahead)/
-            fail IllegalSyntaxError,
-                 'Lookaheads are not regular; cannot generate examples'
-          when %w(! =).include?(match[3]) # e.g. /(?<=lookbehind)/, /(?<!neglookbehind)/
-            fail IllegalSyntaxError,
-                 'Lookbehinds are not regular; cannot generate examples'
-          else # e.g. /(?<name>namedgroup)/
-            @current_position += (match[3].length + 3)
-            group_id = match[3]
-          end
-        end
-        MultiGroup.new(parse, group_id)
-      end
-    end
-
-    def remember_old_regexp_options
-      previous_ignorecase = @ignorecase
-      previous_multiline = @multiline
-      previous_extended = @extended
-      group = yield
-      @ignorecase = previous_ignorecase
-      @multiline = previous_multiline
-      @extended = previous_extended
-      group
-    end
-
-    def regexp_options_toggle(on, off)
-      regexp_option_toggle(on, off, '@ignorecase', 'i')
-      regexp_option_toggle(on, off, '@multiline', 'm')
-      regexp_option_toggle(on, off, '@extended', 'x')
-    end
-
-    def regexp_option_toggle(on, off, var, char)
-      instance_variable_set(var, true) if on.include? char
-      instance_variable_set(var, false) if off.include? char
-    end
-
-    def parse_char_group
-      @current_position += 1 # Skip past opening "["
-      chargroup_parser = ChargroupParser.new(rest_of_string)
-      parsed_chars = chargroup_parser.result
-      @current_position += (chargroup_parser.length - 1) # Step back to closing "]"
-      CharGroup.new(parsed_chars, @ignorecase)
-    end
-
-    def parse_dot_group
-      DotGroup.new(@multiline)
-    end
-
-    def parse_or_group(left_repeaters)
-      @current_position += 1
-      right_repeaters = parse
-      OrGroup.new(left_repeaters, right_repeaters)
-    end
-
-    def parse_single_char_group(char)
-      SingleCharGroup.new(char, @ignorecase)
-    end
-
-    def parse_backreference_group(group_id)
-      BackReferenceGroup.new(group_id)
-    end
-
-    def parse_control_character(char)
-      (char.ord % 32).chr # Black magic!
-      # eval "?\\C-#{char.chr}" # Doesn't work for e.g. char = "?"
-    end
-
-    def parse_unicode_sequence(match)
-      [match.to_i(16)].pack('U')
-    end
-
-    def parse_star_repeater(group)
-      @current_position += 1
-      parse_reluctant_or_possessive_repeater
-      StarRepeater.new(group)
-    end
-
-    def parse_plus_repeater(group)
-      @current_position += 1
-      parse_reluctant_or_possessive_repeater
-      PlusRepeater.new(group)
-    end
-
-    def parse_reluctant_or_possessive_repeater
-      if next_char =~ /[?+]/
-        # Don't treat these repeaters any differently when generating examples
-        @current_position += 1
-      end
-    end
-
-    def parse_question_mark_repeater(group)
-      @current_position += 1
-      parse_reluctant_or_possessive_repeater
-      QuestionMarkRepeater.new(group)
-    end
-
-    def parse_range_repeater(group)
-      match = rest_of_string.match(/\A\{(\d+)?(,)?(\d+)?\}/)
-      @current_position += match[0].size
-      min = match[1].to_i if match[1]
-      has_comma = !match[2].nil?
-      max = match[3].to_i if match[3]
-      repeater = RangeRepeater.new(group, min, has_comma, max)
-      parse_reluctant_or_possessive_range_repeater(repeater, min, has_comma, max)
-    end
-
-    def parse_reluctant_or_possessive_range_repeater(repeater, min, has_comma, max)
-      # .{1}? should be equivalent to (?:.{1})?, i.e. NOT a "non-greedy quantifier"
-      if min && !has_comma && !max && next_char == '?'
-        repeater = parse_question_mark_repeater(repeater)
-      else
-        parse_reluctant_or_possessive_repeater
-      end
-      repeater
-    end
-
-    def raise_anchors_exception!
-      fail IllegalSyntaxError,
-           "Anchors ('#{next_char}') cannot be supported, as they are not regular"
     end
 
     def parse_one_time_repeater(group)
       OneTimeRepeater.new(group)
     end