lib/regexp-examples/chargroup_parser.rb in regexp-examples-0.7.0 vs lib/regexp-examples/chargroup_parser.rb in regexp-examples-1.0.0
- old
+ new
@@ -1,70 +1,119 @@
module RegexpExamples
- # Given an array of chars from inside a character set,
- # Interprets all backslashes, ranges and negations
- # TODO: This needs a bit of a rewrite because:
- # A) It's ugly
- # B) It doesn't take into account nested character groups, or set intersection
- # To achieve this, the algorithm needs to be recursive, like the main Parser.
+ # A "sub-parser", for char groups in a regular expression
+ # Some examples of what this class needs to parse:
+ # [abc] - plain characters
+ # [a-z] - ranges
+ # [\n\b\d] - escaped characters (which may represent character sets)
+ # [^abc] - negated group
+ # [[a][bc]] - sub-groups (should match "a", "b" or "c")
+ # [[:lower:]] - POSIX group
+ # [[a-f]&&[d-z]] - set intersection (should match "d", "e" or "f")
+ # [[^:alpha:]&&[\n]a-c] - all of the above!!!! (should match "\n")
class ChargroupParser
- def initialize(chars)
- @chars = chars
- if @chars[0] == "^"
- @negative = true
- @chars = @chars[1..-1]
- else
- @negative = false
+ attr_reader :regexp_string # the string being parsed, exactly as given to the constructor
+ def initialize(regexp_string, is_sub_group: false) # parses eagerly; read #result and #length afterwards
+ @regexp_string = regexp_string # for nested groups this starts just after the opening "["; presumably the same for top-level callers -- confirm
+ @is_sub_group = is_sub_group # #parse passes true when it recurses into a nested "[...]"
+ @current_position = 0 # index into regexp_string of the next unread character
+ parse # fills @charset / @negative and advances @current_position past the closing "]"
+ end
+
+ def parse # scans regexp_string up to the closing "]", building @charset and @negative
+ @charset = []
+ @negative = false
+ parse_first_chars # handles a leading "^", a literal leading "]" or "-", and POSIX class names
+ until next_char == "]" do # NOTE(review): no end-of-input guard -- confirm the input always contains the closing "]"
+ case next_char
+ when "[" # nested group: parse it recursively and add its characters
+ @current_position += 1
+ sub_group_parser = self.class.new(rest_of_string, is_sub_group: true)
+ @charset.concat sub_group_parser.result
+ @current_position += sub_group_parser.length # skip everything the sub-parser consumed, including its "]"
+ when "-"
+ if regexp_string[@current_position + 1] == "]" # e.g. /[abc-]/ -- not a range!
+ @charset << "-"
+ @current_position += 1
+ else
+ @current_position += 1
+ @charset.concat (@charset.last .. parse_checking_backlash.first).to_a # range from the previously added char to the char after "-"; the repeated start char is dropped by uniq! below
+ @current_position += 1
+ end
+ when "&"
+ if regexp_string[@current_position + 1] == "&" # "&&" is set intersection
+ @current_position += 2
+ sub_group_parser = self.class.new(rest_of_string, is_sub_group: @is_sub_group) # the remainder of this group is parsed as the right-hand operand
+ @charset &= sub_group_parser.result
+ @current_position += (sub_group_parser.length - 1) # land on the "]" the sub-parser consumed, so the loop ends here
+ else
+ @charset << "&" # a single "&" is a literal character
+ @current_position += 1
+ end
+ else
+ @charset.concat parse_checking_backlash # plain or backslash-escaped character(s)
+ @current_position += 1
+ end
end
- init_backslash_chars
- init_ranges
+ @charset.uniq!
+ @current_position += 1 # To account for final "]"
end
+ def length # number of characters consumed from regexp_string, including the closing "]"
+ @current_position
+ end
+
def result # the characters this group matches, with negation applied
- @negative ? (CharSets::Any - @chars) : @chars
+ @negative ? (CharSets::Any - @charset) : @charset # a negated group is CharSets::Any minus the parsed characters
end
private
- def init_backslash_chars
- @chars.each_with_index do |char, i|
- if char == "\\"
- if BackslashCharMap.keys.include?(@chars[i+1])
- @chars[i..i+1] = move_backslash_to_front( BackslashCharMap[@chars[i+1]] )
- elsif @chars[i+1] == 'b'
- @chars[i..i+1] = "\b"
- elsif @chars[i+1] == "\\"
- @chars.delete_at(i+1)
- else
- @chars.delete_at(i)
- end
+ def parse_first_chars # consumes the prefixes that are only special at the very start of a group
+ if next_char == '^' # a leading "^" negates the whole group
+ @negative = true
+ @current_position += 1
+ end
+
+ case rest_of_string
+ when /\A[-\]]/ # e.g. /[]]/ (match "]") or /[-]/ (match "-")
+ @charset << next_char
+ @current_position += 1
+ when /\A:(\^?)([^:]+):\]/ # e.g. [[:alpha:]] - POSIX group
+ if @is_sub_group # only a POSIX class inside a nested "[...]"; otherwise nothing is consumed and #parse reads the text as ordinary characters
+ chars = $1.empty? ? POSIXCharMap[$2] : (CharSets::Any - POSIXCharMap[$2]) # $1 is the optional "^" (negated class), $2 is the class name
+ @charset.concat chars
+ @current_position += ($1.length + $2.length + 2) # skip the "^" (if any), the name and both colons; the "]" is left for #parse
end
end
end
- def init_ranges
- # remove hyphen ("-") from front/back, if present
- hyphen = nil
- hyphen = @chars.shift if @chars.first == "-"
- hyphen ||= @chars.pop if @chars.last == "-"
- # Replace all instances of e.g. ["a", "-", "z"] with ["a", "b", ..., "z"]
- while i = @chars.index("-")
- # Prevent infinite loops from expanding [",", "-", "."] to itself
- # (Since ",".ord = 44, "-".ord = 45, ".".ord = 46)
- if (@chars[i-1] == ',' && @chars[i+1] == '.')
- hyphen = @chars.delete_at(i)
- else
- @chars[i-1..i+1] = (@chars[i-1]..@chars[i+1]).to_a
- end
+ # Reads the item at the current position, resolving a backslash escape if one is present. Always returns an Array, for consistency
+ def parse_checking_backlash # NOTE(review): name looks like a typo for "backslash"; renaming would also need the call sites in #parse updated
+ if next_char == "\\"
+ @current_position += 1 # step over the backslash; callers advance past the escaped character themselves
+ parse_after_backslash
+ else
+ [next_char] # ordinary character, wrapped so both branches return an Array
end
- # restore hyphen, if stripped out earlier
- @chars.unshift(hyphen) if hyphen
end
- def move_backslash_to_front(chars)
- if index = chars.index { |char| char == '\\' }
- chars.unshift chars.delete_at(index)
+ def parse_after_backslash # maps the character following a backslash to what it stands for
+ case next_char
+ when *BackslashCharMap.keys # escapes listed in BackslashCharMap (defined outside this section)
+ BackslashCharMap[next_char] # presumably an Array of characters, matching the other branches -- confirm
+ when 'b' # here \b yields the backspace character
+ ["\b"]
+ else # any other escaped character stands for itself
+ [next_char]
end
- chars
+ end
+
+ def rest_of_string # unparsed remainder: from the current position to the end of regexp_string
+ regexp_string[@current_position..-1]
+ end
+
+ def next_char # character at the current position; nil once the position is past the end of the string
+ regexp_string[@current_position]
end
end
end