lib/regexp-examples/parser.rb in regexp-examples-1.1.3 vs lib/regexp-examples/parser.rb in regexp-examples-1.1.4
- old
+ new
@@ -1,9 +1,26 @@
+require_relative 'parser_helpers/parse_group_helper'
+require_relative 'parser_helpers/parse_after_backslash_group_helper'
+require_relative 'parser_helpers/parse_multi_group_helper'
+require_relative 'parser_helpers/parse_repeater_helper'
+require_relative 'parser_helpers/charset_negation_helper'
+
+# :nodoc:
module RegexpExamples
IllegalSyntaxError = Class.new(StandardError)
+ # A Regexp parser, used to build a structured collection of objects that represents
+ # the regular expression.
+ # This object can then be used to generate strings that match the regular expression.
class Parser
+ include ParseGroupHelper
+ include ParseAfterBackslashGroupHelper
+ include ParseMultiGroupHelper
+ include ParseRepeaterHelper
+ include CharsetNegationHelper
+
attr_reader :regexp_string
+
def initialize(regexp_string, regexp_options)
@regexp_string = regexp_string
@ignorecase = !(regexp_options & Regexp::IGNORECASE).zero?
@multiline = !(regexp_options & Regexp::MULTILINE).zero?
@extended = !(regexp_options & Regexp::EXTENDED).zero?
@@ -25,313 +42,42 @@
private
# Parses the next group by dispatching on `next_char`:
#   '('  -> parse_multi_group            '['  -> parse_char_group
#   '.'  -> parse_dot_group              '|'  -> parse_or_group(repeaters)
#   '\\' -> parse_after_backslash_group  '^'  -> parse_caret
#   '$'  -> parse_dollar                 '#' or whitespace -> parse_extended_whitespace
# Any other character is handed to parse_single_char_group as a literal.
#
# repeaters - passed through to parse_or_group only (used for the '|' case).
#
# Returns whatever the selected parse_* method returns.
# The `+` lines drop the temporary `group` local: the `case` expression becomes
# the last expression of the method, so its value is returned implicitly.
def parse_group(repeaters)
case next_char
when '('
- group = parse_multi_group
+ parse_multi_group
when '['
- group = parse_char_group
+ parse_char_group
when '.'
- group = parse_dot_group
+ parse_dot_group
when '|'
- group = parse_or_group(repeaters)
+ parse_or_group(repeaters)
when '\\'
- group = parse_after_backslash_group
+ parse_after_backslash_group
when '^'
- group = parse_caret
+ parse_caret
when '$'
- group = parse_dollar
+ parse_dollar
when /[#\s]/
- group = parse_extended_whitespace
+ parse_extended_whitespace
else
- group = parse_single_char_group(next_char)
+ parse_single_char_group(next_char)
end
- group
end
# Wraps `group` in a repeater chosen from `next_char`:
#   '*' -> parse_star_repeater           '+' -> parse_plus_repeater
#   '?' -> parse_question_mark_repeater  '{' -> parse_range_repeater
# Any other character means no explicit quantifier, so parse_one_time_repeater
# is used.
#
# group - the group the repeater applies to.
#
# Returns whatever the selected parse_*_repeater method returns.
# The `+` lines drop the temporary `repeater` local and rely on the `case`
# expression being the method's last expression.
def parse_repeater(group)
case next_char
when '*'
- repeater = parse_star_repeater(group)
+ parse_star_repeater(group)
when '+'
- repeater = parse_plus_repeater(group)
+ parse_plus_repeater(group)
when '?'
- repeater = parse_question_mark_repeater(group)
+ parse_question_mark_repeater(group)
when '{'
- repeater = parse_range_repeater(group)
+ parse_range_repeater(group)
else
- repeater = parse_one_time_repeater(group)
+ parse_one_time_repeater(group)
end
- repeater
- end
-
# Handles a '^' anchor.
# When @current_position is 0 the anchor is ignored and a PlaceHolderGroup is
# returned; at any other position raise_anchors_exception! is called.
- def parse_caret
- if @current_position == 0
- return PlaceHolderGroup.new # Ignore the "illegal" character
- else
- raise_anchors_exception!
- end
- end
-
# Handles a '$' anchor.
# When @current_position is the last index of regexp_string the anchor is
# ignored and a PlaceHolderGroup is returned; at any other position
# raise_anchors_exception! is called.
- def parse_dollar
- if @current_position == (regexp_string.length - 1)
- return PlaceHolderGroup.new # Ignore the "illegal" character
- else
- raise_anchors_exception!
- end
- end
-
# Handles a '#' or whitespace character (see the /[#\s]/ branch of parse_group).
# With @extended set, the whitespace/comment is skipped via skip_whitespace and
# a PlaceHolderGroup is returned; otherwise the character is treated as a
# literal through parse_single_char_group.
- def parse_extended_whitespace
- if @extended
- skip_whitespace
- group = PlaceHolderGroup.new # Ignore the whitespace/comment
- else
- group = parse_single_char_group(next_char)
- end
- group
- end
-
# Advances @current_position over a comment ("#" to end of line) or a run of
# whitespace found in rest_of_string.
# The advance is the match length minus one - presumably because the caller
# steps past the final character itself; confirm against the main parse loop.
# NOTE(review): the pattern is not anchored with \A, so this relies on
# rest_of_string beginning with '#' or whitespace - verify against callers.
- def skip_whitespace
- whitespace_chars = rest_of_string.match(/#.*|\s+/)[0]
- @current_position += whitespace_chars.length - 1
- end
-
# Parses the escape sequence that follows a backslash.
#
# Steps past the backslash, then evaluates the `when` conditions in order; the
# first one that holds decides the result:
#   \1..\999          backreference by number
#   \k<name>, \k'-n'  named or relative backreference
#   BackslashCharMap  predefined character set for that escape
#   \cX, \C-X         control character
#   \xHH              hex escape
#   \uHHHH, \u{H..}   unicode escape
#   \p{..}, \P{..}    named property (optionally negated)
#   \K                ignored (PlaceHolderGroup)
#   \R                linebreak alternatives
#   \g                fails with IllegalSyntaxError
#   \b, \B            raise_anchors_exception!
#   \A, \G / \z, \Z   ignored only at the very start / very end, else
#                     raise_anchors_exception!
#   anything else     literal character
# Branches that consume more than one character also advance @current_position.
#
# Returns the group built by the matching branch.
- def parse_after_backslash_group
- @current_position += 1
- case
- when rest_of_string =~ /\A(\d{1,3})/
- @current_position += (Regexp.last_match(1).length - 1) # In case of 10+ backrefs!
- group = parse_backreference_group(Regexp.last_match(1))
- when rest_of_string =~ /\Ak['<]([\w-]+)['>]/ # Named capture group
- @current_position += (Regexp.last_match(1).length + 2)
- group_id = if Regexp.last_match(1).to_i < 0
- # RELATIVE group number, e.g. /(a)(b)(c)(d) \k<-2>/
- @num_groups + Regexp.last_match(1).to_i + 1
- else
- Regexp.last_match(1)
- end
- group = parse_backreference_group(group_id)
- when BackslashCharMap.keys.include?(next_char)
- group = CharGroup.new(
- BackslashCharMap[next_char].dup,
- @ignorecase
- )
- when rest_of_string =~ /\A(c|C-)(.)/ # Control character
- @current_position += Regexp.last_match(1).length
- group = parse_single_char_group(parse_control_character(Regexp.last_match(2)))
- when rest_of_string =~ /\Ax(\h{1,2})/ # Escape sequence
- @current_position += Regexp.last_match(1).length
- group = parse_single_char_group(parse_unicode_sequence(Regexp.last_match(1)))
- when rest_of_string =~ /\Au(\h{4}|\{\h{1,4}\})/ # Unicode sequence
- @current_position += Regexp.last_match(1).length
- sequence = Regexp.last_match(1).match(/\h{1,4}/)[0] # Strip off "{" and "}"
- group = parse_single_char_group(parse_unicode_sequence(sequence))
- when rest_of_string =~ /\A(p)\{(\^?)([^}]+)\}/i # Named properties
- @current_position += (Regexp.last_match(2).length + # 0 or 1, if '^' is present
- Regexp.last_match(3).length + # Length of the property name
- 2) # Length of opening and closing brackets (always 2)
- # Beware of double negatives! E.g. /\P{^Space}/
- is_negative = (Regexp.last_match(1) == 'P') ^ (Regexp.last_match(2) == '^')
- group = CharGroup.new(
- if is_negative
- CharSets::Any.dup - NamedPropertyCharMap[Regexp.last_match(3).downcase]
- else
- NamedPropertyCharMap[Regexp.last_match(3).downcase]
- end,
- @ignorecase
- )
- when next_char == 'K' # Keep (special lookbehind that CAN be supported safely!)
- group = PlaceHolderGroup.new
- when next_char == 'R' # Linebreak
- group = CharGroup.new(
- ["\r\n", "\n", "\v", "\f", "\r"],
- @ignorecase
- ) # Using "\r\n" as one character is a little bit hacky...
- when next_char == 'g' # Subexpression call
- fail IllegalSyntaxError,
- 'Subexpression calls (\\g) cannot be supported, as they are not regular'
- when next_char =~ /[bB]/ # Anchors
- raise_anchors_exception!
- when next_char =~ /[AG]/ # Start of string
# Position 1 here means the backslash was the first character of the pattern.
- if @current_position == 1
- group = PlaceHolderGroup.new
- else
- raise_anchors_exception!
- end
- when next_char =~ /[zZ]/ # End of string
- if @current_position == (regexp_string.length - 1)
- # TODO: /\Z/ should be treated as /\n?/
- group = PlaceHolderGroup.new
- else
- raise_anchors_exception!
- end
- else
- group = parse_single_char_group(next_char)
- end
- group
- end
-
# Parses a parenthesised group.
#
# Steps past the opening character, increments @num_groups, then inspects the
# optional "?..." prefix to decide what kind of group this is:
#   (...)            capture group; group_id is the group number as a String
#   (?:...)          non-capture group; group_id stays nil
#   (?#...)          comment; the comment text is skipped
#   (?i-mx) etc.     option toggle; does not count as a group. Returns a
#                    PlaceHolderGroup unless followed by ':' (e.g. (?i:...))
#   (?=..), (?!..)   lookaheads  - fail with IllegalSyntaxError
#   (?<=..), (?<!..) lookbehinds - fail with IllegalSyntaxError
#   (?<name>...)     named capture; group_id is the name
# The group body is then parsed with `parse` and wrapped in a MultiGroup.
# The whole method body runs inside remember_old_regexp_options.
- def parse_multi_group
- @current_position += 1
- @num_groups += 1
- remember_old_regexp_options do
- group_id = nil # init
- rest_of_string.match(
- /
- \A
- (\?)? # Is it a "special" group, i.e. starts with a "?"?
- (
- : # Non capture group
- |! # Neglookahead
- |= # Lookahead
- |\# # Comment group
- |< # Lookbehind or named capture
- (
- ! # Neglookbehind
- |= # Lookbehind
- |[^>]+ # Named capture
- )
- |[mix]*-?[mix]* # Option toggle
- )?
- /x
- ) do |match|
- case
- when match[1].nil? # e.g. /(normal)/
- group_id = @num_groups.to_s
- when match[2] == ':' # e.g. /(?:nocapture)/
- @current_position += 2
- when match[2] == '#' # e.g. /(?#comment)/
- comment_group = rest_of_string.match(/.*?[^\\](?:\\{2})*\)/)[0]
- @current_position += comment_group.length
- when match[2] =~ /\A(?=[mix-]+)([mix]*)-?([mix]*)/ # e.g. /(?i-mx)/
- regexp_options_toggle(Regexp.last_match(1), Regexp.last_match(2))
- @num_groups -= 1 # Toggle "groups" should not increase backref group count
- @current_position += $&.length + 1
- if next_char == ':' # e.g. /(?i:subexpr)/
- @current_position += 1
- else
- return PlaceHolderGroup.new
- end
- when %w(! =).include?(match[2]) # e.g. /(?=lookahead)/, /(?!neglookahead)/
- fail IllegalSyntaxError,
- 'Lookaheads are not regular; cannot generate examples'
- when %w(! =).include?(match[3]) # e.g. /(?<=lookbehind)/, /(?<!neglookbehind)/
- fail IllegalSyntaxError,
- 'Lookbehinds are not regular; cannot generate examples'
- else # e.g. /(?<name>namedgroup)/
- @current_position += (match[3].length + 3)
- group_id = match[3]
- end
- end
- MultiGroup.new(parse, group_id)
- end
- end
-
# Runs the given block, then restores @ignorecase, @multiline and @extended to
# the values they had before the block ran.
#
# Returns the block's value.
# NOTE(review): the restore is not in an `ensure`, so it is skipped when the
# block exits non-locally (an exception, or a `return` out of the calling
# method) - confirm that is intended.
- def remember_old_regexp_options
- previous_ignorecase = @ignorecase
- previous_multiline = @multiline
- previous_extended = @extended
- group = yield
- @ignorecase = previous_ignorecase
- @multiline = previous_multiline
- @extended = previous_extended
- group
- end
-
# Applies an inline option toggle to the three option flags.
#
# on  - string of option letters to switch on  (any of "i", "m", "x").
# off - string of option letters to switch off (any of "i", "m", "x").
- def regexp_options_toggle(on, off)
- regexp_option_toggle(on, off, '@ignorecase', 'i')
- regexp_option_toggle(on, off, '@multiline', 'm')
- regexp_option_toggle(on, off, '@extended', 'x')
- end
-
# Sets one option flag from a toggle.
#
# on   - option letters being switched on.
# off  - option letters being switched off.
# var  - name of the instance variable to set, e.g. '@ignorecase'.
# char - the option letter that controls `var`.
#
# The variable is left untouched when `char` is in neither string. When it is
# in both, the "off" assignment runs last and wins.
- def regexp_option_toggle(on, off, var, char)
- instance_variable_set(var, true) if on.include? char
- instance_variable_set(var, false) if off.include? char
- end
-
# Parses a bracketed character set such as [abc].
# Delegates the contents to ChargroupParser, advances @current_position by the
# length it reports (minus one), and wraps the parsed characters in a CharGroup
# that honours @ignorecase.
- def parse_char_group
- @current_position += 1 # Skip past opening "["
- chargroup_parser = ChargroupParser.new(rest_of_string)
- parsed_chars = chargroup_parser.result
- @current_position += (chargroup_parser.length - 1) # Step back to closing "]"
- CharGroup.new(parsed_chars, @ignorecase)
- end
-
# Builds the group for '.', passing the current @multiline flag to DotGroup.
- def parse_dot_group
- DotGroup.new(@multiline)
- end
-
# Parses an alternation at '|'.
#
# left_repeaters - what was parsed before the '|'.
#
# Steps past the '|', parses the remainder with `parse` as the right-hand side,
# and returns an OrGroup of the two sides.
- def parse_or_group(left_repeaters)
- @current_position += 1
- right_repeaters = parse
- OrGroup.new(left_repeaters, right_repeaters)
- end
-
# Wraps a literal character in a SingleCharGroup, passing the current
# @ignorecase flag.
- def parse_single_char_group(char)
- SingleCharGroup.new(char, @ignorecase)
- end
-
# Builds a BackReferenceGroup for the given group id (callers pass a group
# number or a group name).
- def parse_backreference_group(group_id)
- BackReferenceGroup.new(group_id)
- end
-
# Converts the character after \c or \C- into its control character by taking
# the character's ordinal modulo 32.
- def parse_control_character(char)
- (char.ord % 32).chr # Black magic!
- # eval "?\\C-#{char.chr}" # Doesn't work for e.g. char = "?"
- end
-
# Converts a string of hex digits into the character with that code point.
#
# match - hex digits as a String (no "\x", "\u" or braces).
- def parse_unicode_sequence(match)
- [match.to_i(16)].pack('U')
- end
-
# Handles the '*' quantifier: steps past it, consumes an optional trailing
# '?' or '+' modifier, and returns a StarRepeater for `group`.
- def parse_star_repeater(group)
- @current_position += 1
- parse_reluctant_or_possessive_repeater
- StarRepeater.new(group)
- end
-
# Handles the '+' quantifier: steps past it, consumes an optional trailing
# '?' or '+' modifier, and returns a PlusRepeater for `group`.
- def parse_plus_repeater(group)
- @current_position += 1
- parse_reluctant_or_possessive_repeater
- PlusRepeater.new(group)
- end
-
# Skips a '?' (reluctant) or '+' (possessive) modifier if one is next.
# The modifier is consumed but otherwise has no effect on the parsed result.
- def parse_reluctant_or_possessive_repeater
- if next_char =~ /[?+]/
- # Don't treat these repeaters any differently when generating examples
- @current_position += 1
- end
- end
-
# Handles the '?' quantifier: steps past it, consumes an optional trailing
# '?' or '+' modifier, and returns a QuestionMarkRepeater for `group`.
- def parse_question_mark_repeater(group)
- @current_position += 1
- parse_reluctant_or_possessive_repeater
- QuestionMarkRepeater.new(group)
- end
-
# Handles a range quantifier such as {3}, {2,}, {,5} or {2,5}.
#
# Consumes the whole "{...}" text, builds a RangeRepeater from the parsed
# min / comma / max parts (min and max stay nil when absent), then lets
# parse_reluctant_or_possessive_range_repeater deal with any trailing modifier.
#
# NOTE(review): `match` is nil when the text at '{' is not a well-formed range,
# which would raise NoMethodError on `match[0]` - verify how callers guard this.
- def parse_range_repeater(group)
- match = rest_of_string.match(/\A\{(\d+)?(,)?(\d+)?\}/)
- @current_position += match[0].size
- min = match[1].to_i if match[1]
- has_comma = !match[2].nil?
- max = match[3].to_i if match[3]
- repeater = RangeRepeater.new(group, min, has_comma, max)
- parse_reluctant_or_possessive_range_repeater(repeater, min, has_comma, max)
- end
-
# Handles what follows a range quantifier.
#
# For an exact-count range ({n}: min given, no comma, no max) followed by '?',
# the '?' is a real optional quantifier applied to the range repeater, so the
# repeater is wrapped via parse_question_mark_repeater. In every other case a
# trailing '?' or '+' is just a modifier and is skipped.
#
# Returns the (possibly wrapped) repeater.
- def parse_reluctant_or_possessive_range_repeater(repeater, min, has_comma, max)
- # .{1}? should be equivalent to (?:.{1})?, i.e. NOT a "non-greedy quantifier"
- if min && !has_comma && !max && next_char == '?'
- repeater = parse_question_mark_repeater(repeater)
- else
- parse_reluctant_or_possessive_repeater
- end
- repeater
- end
-
# Fails with IllegalSyntaxError for an unsupported anchor, naming the current
# `next_char` in the message.
- def raise_anchors_exception!
- fail IllegalSyntaxError,
- "Anchors ('#{next_char}') cannot be supported, as they are not regular"
end
# Wraps `group` in a OneTimeRepeater; parse_repeater uses this when the next
# character is not a quantifier.
def parse_one_time_repeater(group)
OneTimeRepeater.new(group)
end