module RegexpExamples
  # A "sub-parser", for char groups in a regular expression.
  # It is given the text that follows a group's opening "[" and consumes
  # characters up to, and including, the matching "]".
  #
  # Some examples of what this class needs to parse:
  # [abc]          - plain characters
  # [a-z]          - ranges
  # [\n\b\d]       - escaped characters (which may represent character sets)
  # [^abc]         - negated group
  # [[a][bc]]      - sub-groups (should match "a", "b" or "c")
  # [[:lower:]]    - POSIX group
  # [[a-f]&&[d-z]] - set intersection (should match "d", "e" or "f")
  # [[^:alpha:]&&[\n]a-c] - all of the above!!!! (should match "\n")
  class ChargroupParser
    attr_reader :regexp_string

    # regexp_string - the text to parse, starting with the first character
    #                 *inside* the group (i.e. just after the opening "[").
    #                 Anything after the group's closing "]" is ignored.
    # is_sub_group  - true when this group is nested inside another char
    #                 group. POSIX classes such as [:alpha:] are only
    #                 recognised in that case.
    #
    # Parsing happens immediately; raises ArgumentError if the group is never
    # closed by a "]".
    def initialize(regexp_string, is_sub_group: false)
      @regexp_string = regexp_string
      @is_sub_group = is_sub_group
      @current_position = 0
      parse
    end

    # Parses the group from the start of regexp_string, populating the
    # character set, the negation flag and the consumed length.
    #
    # Raises ArgumentError if the input ends before the closing "]".
    def parse
      # Restart from the beginning, so that calling #parse again re-parses the
      # same group instead of running on past its closing "]".
      @current_position = 0
      @charset = []
      @negative = false
      parse_first_chars
      until next_char == ']'
        # Without this guard, an unterminated group would loop forever,
        # appending nil to the charset on every iteration.
        unterminated_group! if next_char.nil?

        case next_char
        when '[' then parse_sub_group
        when '-' then parse_after_hyphen
        when '&' then parse_after_ampersand
        else
          @charset.concat parse_checking_backlash
          @current_position += 1
        end
      end

      @charset.uniq!
      @current_position += 1 # To account for final "]"
    end

    # Number of characters consumed from regexp_string, including the
    # closing "]".
    def length
      @current_position
    end

    # The characters matched by this group, as an Array. For a negated group
    # this is CharSets::Any minus the characters that were listed.
    def result
      @negative ? (CharSets::Any - @charset) : @charset
    end

    private

    # Handles the characters that are only special at the very start of a
    # group: a leading "^" (negation), a leading "]" or "-" (taken
    # literally), and - for sub-groups only - a POSIX class name.
    def parse_first_chars
      if next_char == '^'
        @negative = true
        @current_position += 1
      end

      case rest_of_string
      when /\A[-\]]/ # e.g. /[]]/ (match "]") or /[-]/ (match "-")
        @charset << next_char
        @current_position += 1
      when /\A:(\^?)([^:]+):\]/ # e.g. [[:alpha:]] - POSIX group
        if @is_sub_group
          chars = if Regexp.last_match(1).empty?
                    POSIXCharMap[Regexp.last_match(2)]
                  else
                    CharSets::Any - POSIXCharMap[Regexp.last_match(2)]
                  end
          @charset.concat chars
          # Skip the optional "^", the class name and the two colons. The
          # trailing "]" is left in place to terminate the main parse loop.
          @current_position += (Regexp.last_match(1).length + Regexp.last_match(2).length + 2)
        end
      end
    end

    # Parses a nested group, e.g. the "[a]" in /[[a]bc]/, and adds its
    # characters to the current set.
    def parse_sub_group
      @current_position += 1 # Step over the sub-group's opening "["
      sub_group_parser = self.class.new(rest_of_string, is_sub_group: true)
      @charset.concat sub_group_parser.result
      @current_position += sub_group_parser.length
    end

    # Parses a "-": either a literal hyphen (when it is the last character of
    # the group) or a range running from the previously parsed character.
    def parse_after_hyphen
      if regexp_string[@current_position + 1] == ']' # e.g. /[abc-]/ -- not a range!
        @charset << '-'
        @current_position += 1
        return
      end

      @current_position += 1
      range_end = parse_checking_backlash.first
      # The input ended straight after the "-", e.g. "a-"
      unterminated_group! if range_end.nil?
      @charset.concat((@charset.last..range_end).to_a)
      @current_position += 1
    end

    # Parses an "&": either the "&&" set-intersection operator, or a literal
    # ampersand.
    def parse_after_ampersand
      if regexp_string[@current_position + 1] == '&'
        @current_position += 2
        # Everything after "&&" is parsed as one set by a nested parser, which
        # also consumes this group's closing "]"...
        sub_group_parser = self.class.new(rest_of_string, is_sub_group: @is_sub_group)
        @charset &= sub_group_parser.result
        # ...so step back by one, leaving that "]" for the main parse loop.
        @current_position += (sub_group_parser.length - 1)
      else
        @charset << '&'
        @current_position += 1
      end
    end

    # Parses the character at the current position, which may be the start of
    # a backslash escape. Leaves the position on the last character consumed.
    # Always returns an Array, for consistency.
    # (The method name's spelling is kept as-is for compatibility.)
    def parse_checking_backlash
      if next_char == '\\'
        @current_position += 1
        parse_after_backslash
      else
        [next_char]
      end
    end

    # Resolves the character following a backslash: a BackslashCharMap entry
    # if one exists, a backspace for "\b" (its meaning inside a char group),
    # or otherwise the character itself.
    def parse_after_backslash
      case next_char
      when *BackslashCharMap.keys
        BackslashCharMap[next_char]
      when 'b'
        ["\b"]
      else
        [next_char]
      end
    end

    # Signals that regexp_string ended before the group was closed.
    def unterminated_group!
      raise ArgumentError, "Unterminated character group in #{regexp_string.inspect}"
    end

    # The not-yet-parsed remainder of the input.
    def rest_of_string
      regexp_string[@current_position..-1]
    end

    # The character at the current position, or nil once past the end.
    def next_char
      regexp_string[@current_position]
    end
  end
end