-
10
Dir[File.dirname(__FILE__) + '/regexp-examples/*.rb'].each {|file| require file }
-
-
1
module RegexpExamples
  # Fills in the back-reference placeholders (e.g. "__1__" or "__name__") that
  # BackReferenceGroup#result leaves in generated examples, using the text of
  # the capture group each placeholder refers to.
  class BackReferenceReplacer
    # A group id wrapped in double underscores.
    PLACEHOLDER = /__(\w+?)__/

    # Replaces every placeholder in every example. The examples are mutated
    # in place (via String#sub!).
    #
    # full_examples - Array of string-like objects responding to #all_subgroups
    #
    # Returns an Array containing the same (now substituted) example objects.
    # Raises TypeError when a placeholder names a group that is not among the
    # example's subgroups.
    def substitute_backreferences(full_examples)
      full_examples.map do |full_example|
        while (placeholder = full_example.match(PLACEHOLDER))
          group_id = placeholder[1]
          backref = find_backref_for(full_example, group_id)
          if backref.nil?
            raise TypeError, "no capture group found for backreference #{group_id.inspect}"
          end
          # Block form on purpose: a replacement *string* would have its
          # backslash sequences interpreted ("\\\\" collapses, "\\0" re-inserts
          # the placeholder and loops forever). A block result is used verbatim.
          full_example.sub!(PLACEHOLDER) { backref }
        end
        full_example
      end
    end

    private

    # Returns the first subgroup of full_example whose group_id equals
    # group_id, or nil when there is none.
    def find_backref_for(full_example, group_id)
      full_example.all_subgroups.detect do |subgroup|
        subgroup.group_id == group_id
      end
    end
  end
end
-
1
module RegexpExamples
  # How many extra repetitions are generated for the unbounded
  # Star (*) and Plus (+) repeaters
  TIMES = 2

  # Upper limit on the number of results taken from a single group,
  # to keep the generated output small. For example:
  #   If MaxGroupResults = 5, then
  #   \d = [0, 1, 2, 3, 4]
  MaxGroupResults = 5

  module CharSets
    Lower = ('a'..'z').to_a
    Upper = ('A'..'Z').to_a
    Digit = ('0'..'9').to_a
    # 45.chr = "-" deliberately comes FIRST: char groups are parsed using the
    # /[a-z]/ range syntax, where a "-" anywhere else would be read as a range.
    Punct = [45..45, 33..44, 46..47, 58..64, 91..96, 123..126].flat_map do |codes|
      codes.map(&:chr)
    end
    Hex = ('a'..'f').to_a | ('A'..'F').to_a | Digit
    Any = Lower | Upper | Digit | Punct
  end

  whitespace = [' ', "\t", "\n", "\r", "\v", "\f"]

  # Special (backslash-escaped) regex characters => the character set they stand for
  BackslashCharMap = {
    # Character classes
    'd' => CharSets::Digit,
    'D' => CharSets::Lower | CharSets::Upper | CharSets::Punct,
    'w' => CharSets::Lower | CharSets::Upper | CharSets::Digit | ['_'],
    'W' => CharSets::Punct.reject { |char| char == '_' },
    's' => whitespace,
    'S' => CharSets::Any - whitespace,
    'h' => CharSets::Hex,
    'H' => CharSets::Any - CharSets::Hex,

    # Single control characters
    't' => ["\t"], # tab
    'n' => ["\n"], # new line
    'r' => ["\r"], # carriage return
    'f' => ["\f"], # form feed
    'a' => ["\a"], # alarm
    'v' => ["\v"], # vertical tab
    'e' => ["\e"], # escape
  }
end
-
-
1
module RegexpExamples
  # Common ancestor of the errors defined by this library
  Error = Class.new(StandardError)

  # Subclass of Error; raised in this file with "not yet supported/implemented" messages
  UnsupportedSyntaxError = Class.new(Error)

  # Subclass of Error; raised in this file for anchors and lookarounds ("not regular")
  IllegalSyntaxError = Class.new(Error)
end
-
1
module RegexpExamples
-
# Every Group#result method returns an array of GroupResult objects.
# A GroupResult is a String that also remembers the id of the capture group
# that produced it (if any) and the captured groups nested inside it, so that
# backreferences can be filled in later.
class GroupResult < String
  attr_reader :group_id, :subgroups

  # result    - the generated text. If it is itself group-aware (responds to
  #             #group_id), its captured groups are adopted and the
  #             subgroups argument is ignored.
  # group_id  - id of the capture group that produced the text, or nil
  # subgroups - captured groups contained in the text
  def initialize(result, group_id = nil, subgroups = [])
    @group_id = group_id
    @subgroups = result.respond_to?(:group_id) ? result.all_subgroups : subgroups
    super(result)
  end

  # Returns this result together with its subgroups, flattened, keeping only
  # the ones that carry a group_id.
  def all_subgroups
    candidates = [self, subgroups].flatten
    candidates.reject { |candidate| candidate.group_id.nil? }
  end

  # Overridden so that the repeated string keeps @group_id and @subgroups.
  # Used by BaseRepeater#result when repeating a group's results.
  def *(int)
    repeated_text = super.to_s
    self.class.new(repeated_text, group_id, subgroups)
  end
end
-
-
1
# A group with exactly one possible result: the string it was built with.
class SingleCharGroup
  def initialize(char)
    @char = char
  end

  # Returns a one-element Array holding the string wrapped in a GroupResult.
  def result
    only_result = GroupResult.new(@char)
    [only_result]
  end
end
-
-
1
# A set of alternative single characters, e.g. the contents of /[a-c\d]/ or
# one of the predefined BackslashCharMap sets.
class CharGroup
  # chars - Array of one-character strings as they appear between the square
  #         brackets (a leading "^" negates the set), or a predefined set.
  def initialize(chars)
    # Work on a private copy: the helpers below edit @chars in place, and the
    # parser passes the shared BackslashCharMap arrays straight in. Without
    # the copy those constants were permanently altered by the first use.
    @chars = chars.dup
    if @chars[0] == "^"
      @negative = true
      @chars = @chars[1..-1]
    else
      @negative = false
    end

    init_backslash_chars
    init_ranges
  end

  # Expands every "x-y" triple in @chars into the full range of characters.
  # A "-" in first or last position is a literal and is kept as is.
  def init_ranges
    # save first and last "-" if present
    first = nil
    last = nil
    first = @chars.shift if @chars.first == "-"
    last = @chars.pop if @chars.last == "-"
    # Replace all instances of e.g. ["a" "-" "z"] with ["a", "b", ..., "z"]
    while i = @chars.index("-")
      @chars[i-1..i+1] = (@chars[i-1]..@chars[i+1]).to_a
    end
    # restore them back
    @chars.unshift(first) if first
    @chars.push(last) if last
  end

  # Resolves backslash escapes inside @chars:
  #   "\" + BackslashCharMap key => the mapped character set
  #   "\" + "\"                 => a single literal backslash
  #   "\" + anything else       => that character, unescaped
  #
  # NOTE(review): @chars is edited while it is being iterated, so characters
  # spliced in from BackslashCharMap are visited again by this loop. That
  # behaviour is kept unchanged here.
  def init_backslash_chars
    @chars.each_with_index do |char, i|
      if char == "\\"
        if BackslashCharMap.key?(@chars[i+1])
          @chars[i..i+1] = BackslashCharMap[@chars[i+1]]
        elsif @chars[i+1] == "\\"
          @chars.delete_at(i+1)
        else
          @chars.delete_at(i)
        end
      end
    end
  end

  # Returns one GroupResult per character in the set. For a negated set, the
  # characters are CharSets::Any minus the listed ones.
  def result
    (@negative ? (CharSets::Any - @chars) : @chars).map do |result|
      GroupResult.new(result)
    end
  end
end
-
-
1
# The "." wildcard: any character from CharSets::Any.
class DotGroup
  # Returns one GroupResult for each character in CharSets::Any.
  def result
    CharSets::Any.map { |char| GroupResult.new(char) }
  end
end
-
-
1
# A parenthesised group: a sequence of repeaters plus the id under which the
# group is captured.
class MultiGroup
  attr_reader :group_id

  def initialize(groups, group_id)
    @groups = groups
    @group_id = group_id
  end

  # Collects the result of each contained repeater, combines them into every
  # possible whole-group string, and tags each of those strings with this
  # group's id.
  def result
    results_per_repeater = @groups.map(&:result)
    combined = RegexpExamples::permutations_of_strings(results_per_repeater)
    combined.map { |text| GroupResult.new(text, group_id) }
  end
end
-
-
1
# Marker object with no behaviour of its own: the parser returns an instance
# when it reaches ")" and checks for this class to stop parsing a group.
class MultiGroupEnd; end
-
-
1
# An alternation ("left|right"). Each side is a sequence of repeaters.
class OrGroup
  def initialize(left_repeaters, right_repeaters)
    @left_repeaters = left_repeaters
    @right_repeaters = right_repeaters
  end

  # Returns the distinct strings produced by either side, each wrapped in a
  # GroupResult (which keeps any capture groups of the underlying result).
  def result
    alternatives = results_for(@left_repeaters) + results_for(@right_repeaters)
    alternatives.uniq.map { |text| GroupResult.new(text) }
  end

  private

  # Every string one side can produce. The repeaters of a side form a
  # SEQUENCE, so their results are combined together: /ab|cd/ must yield
  # "ab" and "cd", not "a", "b", "c", "d". An empty side contributes nothing.
  def results_for(repeaters)
    return [] if repeaters.empty?
    RegexpExamples::permutations_of_strings(repeaters.map(&:result))
  end
end
-
-
1
# A backreference to a capture group, identified by number or name.
class BackReferenceGroup
  attr_reader :id

  def initialize(id)
    @id = id
  end

  # The referenced text is not known at this point, so the single result is a
  # placeholder of the form "__<id>__", which BackReferenceReplacer looks for.
  def result
    placeholder = "__#{@id}__"
    [GroupResult.new(placeholder)]
  end
end
-
-
end
-
1
module RegexpExamples
  # Given an array of arrays of strings, returns all possible permutations
  # for strings created by joining one element from each array, in order.
  #
  # For example:
  #   permutations_of_strings [ ['a'], ['b'], ['c', 'd', 'e'] ] #=> ['abc', 'abd', 'abe']
  #   permutations_of_strings [ ['a', 'b'], ['c', 'd'] ] #=> [ 'ac', 'ad', 'bc', 'bd' ]
  #
  # arrays_of_strings - Array of Arrays of GroupResult; it is NOT modified
  # options           - currently unused; only passed on to the recursive call
  #
  # A single inner array is returned as is. An empty outer array yields one
  # empty GroupResult (joining nothing gives the empty string), so callers
  # can always call Array methods on the return value.
  def self.permutations_of_strings(arrays_of_strings, options={})
    return [GroupResult.new('')] if arrays_of_strings.empty?

    # Destructure rather than #shift, so the caller's array is left intact
    first, *rest = arrays_of_strings
    return first if rest.empty?

    first.product(permutations_of_strings(rest, options)).map do |pair|
      join_preserving_capture_groups(pair)
    end
  end

  # Joins the given results into one GroupResult that carries the capture
  # groups of every part, so that backreferences can still be resolved.
  def self.join_preserving_capture_groups(result)
    parts = result.flatten
    subgroups = parts.map(&:all_subgroups).flatten
    GroupResult.new(parts.join, nil, subgroups)
  end
end
-
-
1
module RegexpExamples
-
1
# Turns the source of a regular expression into a list of repeater objects
# (each wrapping a group), from which example strings can be generated.
class Parser
  ANCHOR_ERROR_MESSAGE = "Anchors cannot be supported, as they are not regular"

  attr_reader :regexp_string

  # regexp_string - the regular expression's source (e.g. Regexp#source)
  def initialize(regexp_string)
    @regexp_string = regexp_string
    @num_groups = 0
    @current_position = 0
  end

  # Parses from the current position up to the end of the string, or up to
  # the ")" that closes the group currently being parsed. That ")" is left
  # for the caller to consume.
  #
  # Returns an Array of repeaters.
  # Raises IllegalSyntaxError / UnsupportedSyntaxError for syntax that cannot
  # be, or is not yet, turned into examples.
  def parse
    repeaters = []
    while @current_position < regexp_string.length
      group = parse_group(repeaters)
      break if group.is_a? MultiGroupEnd
      if group.is_a? OrGroup
        # The OrGroup swallowed everything parsed so far (its left side) and
        # everything up to the end of the string / enclosing ")" (its right
        # side). The position must NOT be advanced here, or the enclosing
        # ")" is skipped and whatever follows the group is parsed into it.
        repeaters = []
      else
        @current_position += 1
      end
      repeaters << parse_repeater(group)
    end
    repeaters
  end

  private

  # Parses the group starting at the current position. On return the
  # position is on the LAST character belonging to the group.
  def parse_group(repeaters)
    char = regexp_string[@current_position]
    case char
    when '('
      parse_multi_group
    when ')'
      parse_multi_end_group
    when '['
      parse_char_group
    when '.'
      parse_dot_group
    when '|'
      parse_or_group(repeaters)
    when '\\'
      parse_after_backslash_group
    when '^'
      # Only the anchor characters themselves are special here: the letters
      # "A", "z" and "Z" are anchors only after a backslash, which is handled
      # in parse_after_backslash_group.
      parse_anchor(@current_position == 0)
    when '$'
      parse_anchor(@current_position == (regexp_string.length - 1))
    else
      parse_single_char_group(char)
    end
  end

  # An anchor at the very start/end of the pattern is ignored (it produces an
  # empty string); anywhere else it cannot be honoured.
  def parse_anchor(at_pattern_boundary)
    raise IllegalSyntaxError, ANCHOR_ERROR_MESSAGE unless at_pattern_boundary
    parse_single_char_group('') # Ignore the "illegal" character
  end

  # Parses an escape sequence; called with the position on the backslash.
  def parse_after_backslash_group
    @current_position += 1
    case
    when rest_of_string =~ /\A(\d+)/
      @current_position += ($1.length - 1) # In case of 10+ backrefs!
      parse_backreference_group($1)
    when rest_of_string =~ /\Ak<([^>]+)>/ # Named capture group
      @current_position += ($1.length + 2)
      parse_backreference_group($1)
    when BackslashCharMap.key?(regexp_string[@current_position])
      CharGroup.new(BackslashCharMap[regexp_string[@current_position]])
    when rest_of_string =~ /\A(c|C-)(.)/ # Control character
      @current_position += $1.length
      parse_single_char_group( parse_control_character($2) )
    when rest_of_string =~ /\Ax(\h{1,2})/ # Escape sequence
      @current_position += $1.length
      parse_single_char_group( parse_escape_sequence($1) )
    when rest_of_string =~ /\Au(\h{4}|\{\h{1,4}\})/ # Unicode sequence
      @current_position += $1.length
      sequence = $1.match(/\h{1,4}/)[0] # Strip off "{" and "}"
      parse_single_char_group( parse_unicode_sequence(sequence) )
    when rest_of_string =~ /\Ap\{([^}]+)\}/ # Named properties
      @current_position += ($1.length + 2)
      raise UnsupportedSyntaxError, "Named properties (\\p{#{$1}}) are not yet supported"
    when rest_of_string =~ /\Ag/ # Subexpression call
      # TODO: Should this be IllegalSyntaxError ?
      raise UnsupportedSyntaxError, "Subexpression calls (\\g) are not yet supported"
    when rest_of_string =~ /\A[GbB]/ # Anchors
      raise IllegalSyntaxError, ANCHOR_ERROR_MESSAGE
    when rest_of_string =~ /\AA/ # Start of string
      parse_anchor(@current_position == 1)
    when rest_of_string =~ /\A[zZ]/ # End of string
      parse_anchor(@current_position == (regexp_string.length - 1))
    else
      # Any other escaped character stands for itself
      parse_single_char_group( regexp_string[@current_position] )
    end
  end

  # Wraps group in the repeater named by the character at the current
  # position (which is just past the group), if there is one.
  def parse_repeater(group)
    case regexp_string[@current_position]
    when '*' then parse_star_repeater(group)
    when '+' then parse_plus_repeater(group)
    when '?' then parse_question_mark_repeater(group)
    when '{' then parse_range_repeater(group)
    else parse_one_time_repeater(group)
    end
  end

  # Parses "(...)"; called with the position on the "(".
  def parse_multi_group
    @current_position += 1
    group_id = nil # init
    rest_of_string.match(/\A(\?)?(:|!|=|<(!|=|[^!=][^>]*))?/) do |match|
      case
      when match[1].nil? # e.g. /(normal)/
        @num_groups += 1
        group_id = @num_groups.to_s
      when match[2].nil? # e.g. /(?i)/, /(?>atomic)/, /(?#comment)/
        raise UnsupportedSyntaxError, "Group extensions other than (?:...) and (?<name>...) are not yet supported"
      when match[2] == ':' # e.g. /(?:nocapture)/
        # Does not capture, so it must not use up a group number either
        @current_position += 2
        group_id = nil
      when %w(! =).include?(match[2]) # e.g. /(?=lookahead)/, /(?!neglookahead)/
        raise IllegalSyntaxError, "Lookaheads are not regular; cannot generate examples"
      when %w(! =).include?(match[3]) # e.g. /(?<=lookbehind)/, /(?<!neglookbehind)/
        raise IllegalSyntaxError, "Lookbehinds are not regular; cannot generate examples"
      else # e.g. /(?<name>namedgroup)/
        @num_groups += 1
        @current_position += (match[3].length + 3)
        group_id = match[3]
      end
    end
    groups = parse
    MultiGroup.new(groups, group_id)
  end

  def parse_multi_end_group
    MultiGroupEnd.new
  end

  # Parses "[...]"; called with the position on the "[".
  def parse_char_group
    if rest_of_string =~ /\A\[\[:[^:]+:\]\]/
      raise UnsupportedSyntaxError, "POSIX bracket expressions are not yet implemented"
    end
    chars = []
    @current_position += 1
    if regexp_string[@current_position] == ']'
      # Beware of the sneaky edge case:
      # /[]]/ (match "]")
      chars << ']'
      @current_position += 1
    end
    until regexp_string[@current_position] == ']' \
      && !regexp_string[0..@current_position-1].match(/[^\\](\\{2})*\\\z/)
      # Beware of having an ODD number of "\" before the "]", e.g.
      # /[\]]/ (match "]")
      # /[\\]/ (match "\")
      # /[\\\]]/ (match "\" or "]")
      chars << regexp_string[@current_position]
      @current_position += 1
    end
    CharGroup.new(chars)
  end

  def parse_dot_group
    DotGroup.new
  end

  # Parses the right-hand side of a "|"; called with the position on the "|".
  def parse_or_group(left_repeaters)
    @current_position += 1
    right_repeaters = parse
    OrGroup.new(left_repeaters, right_repeaters)
  end

  def parse_single_char_group(char)
    SingleCharGroup.new(char)
  end

  def parse_backreference_group(match)
    BackReferenceGroup.new(match)
  end

  def parse_control_character(char)
    (char.ord % 32).chr # Black magic!
    # eval "?\\C-#{char.chr}" # Doesn't work for e.g. char = "?"
  end

  # NOTE(review): eval is only reached with 1-2 hex digits (enforced by the
  # regexp in parse_after_backslash_group). Keep it that way.
  def parse_escape_sequence(match)
    eval "?\\x#{match}"
  end

  # NOTE(review): eval is only reached with 1-4 hex digits (enforced by the
  # regexps in parse_after_backslash_group). Keep it that way.
  def parse_unicode_sequence(match)
    eval "?\\u{#{match}}"
  end

  def parse_star_repeater(group)
    @current_position += 1
    StarRepeater.new(group)
  end

  def parse_plus_repeater(group)
    @current_position += 1
    PlusRepeater.new(group)
  end

  def parse_question_mark_repeater(group)
    @current_position += 1
    QuestionMarkRepeater.new(group)
  end

  # Parses "{n}", "{n,}", "{,m}" or "{n,m}"; called with the position on "{".
  def parse_range_repeater(group)
    match = rest_of_string.match(/\A\{(\d+)?(,)?(\d+)?\}/)
    # A "{" that does not open a quantifier with at least one number is left
    # in place, to be parsed as an ordinary character on the next iteration.
    return parse_one_time_repeater(group) if match.nil? || (match[1].nil? && match[3].nil?)

    @current_position += match[0].size
    min = match[1].to_i if match[1]
    has_comma = !match[2].nil?
    max = match[3].to_i if match[3]
    RangeRepeater.new(group, min, has_comma, max)
  end

  def parse_one_time_repeater(group)
    OneTimeRepeater.new(group)
  end

  # The part of the pattern from the current position to the end.
  def rest_of_string
    regexp_string[@current_position..-1]
  end
end
-
end
-
-
1
class Regexp
  module Examples
    # Returns an Array of example strings generated from this regexp's source:
    # the source is parsed into repeaters, the results of all repeaters are
    # combined, and finally the backreference placeholders are filled in.
    def examples
      repeaters = RegexpExamples::Parser.new(source).parse
      partial_examples = repeaters.map(&:result)
      full_examples = RegexpExamples::permutations_of_strings(partial_examples)
      replacer = RegexpExamples::BackReferenceReplacer.new
      replacer.substitute_backreferences(full_examples)
    end
  end
  include Examples
end
-
-
1
module RegexpExamples
  # Common behaviour of all repeaters: a repeater wraps one group and produces
  # that group's results repeated a number of times.
  class BaseRepeater
    attr_reader :group

    def initialize(group)
      @group = group
    end

    # Returns the distinct strings obtained by repeating each of the group's
    # first MaxGroupResults results min_repeats..max_repeats times (ordered by
    # repeat count, then by group result).
    def result(min_repeats, max_repeats)
      group_results = @group.result.first(MaxGroupResults)
      repeated = (min_repeats..max_repeats).flat_map do |repeats|
        group_results.map { |group_result| group_result * repeats }
      end
      repeated.uniq
    end
  end

  # No quantifier: exactly once
  class OneTimeRepeater < BaseRepeater
    def result
      super(1, 1)
    end
  end

  # "*": zero up to TIMES repetitions
  class StarRepeater < BaseRepeater
    def result
      super(0, TIMES)
    end
  end

  # "+": one up to TIMES repetitions
  class PlusRepeater < BaseRepeater
    def result
      super(1, TIMES)
    end
  end

  # "?": zero or one repetition
  class QuestionMarkRepeater < BaseRepeater
    def result
      super(0, 1)
    end
  end

  # "{min}", "{min,}", "{,max}" or "{min,max}"
  class RangeRepeater < BaseRepeater
    # min       - lower bound, or nil when omitted (treated as 0)
    # has_comma - whether the quantifier contained a ","
    # max       - upper bound, or nil when omitted
    def initialize(group, min, has_comma, max)
      super(group)
      @min = min || 0
      # The fallbacks are based on @min, not the raw argument, which is nil
      # when the lower bound was omitted.
      @max = if max
               max
             elsif has_comma
               @min + TIMES
             else
               @min
             end
    end

    def result
      super(@min, @max)
    end
  end
end
-
-
1
RSpec.describe Regexp, "#examples" do
-
1
def self.examples_exist_and_match(*regexps)
-
11
regexps.each do |regexp|
-
75
it do
-
75
regexp_examples = regexp.examples
-
75
expect(regexp_examples).not_to be_empty
-
246
regexp_examples.each { |example| expect(example).to match(/\A(?:#{regexp.source})\z/) }
-
# Note: /\A...\z/ is used, to prevent misleading examples from passing the test.
-
# For example, we don't want things like:
-
# /a*/.examples to include "xyz"
-
# /a|b/.examples to include "bad"
-
end
-
end
-
end
-
-
1
def self.examples_raise_illegal_syntax_error(*regexps)
-
1
regexps.each do |regexp|
-
12
it do
-
24
expect{regexp.examples}.to raise_error RegexpExamples::IllegalSyntaxError
-
end
-
end
-
end
-
-
1
def self.examples_raise_unsupported_syntax_error(*regexps)
-
1
regexps.each do |regexp|
-
5
it do
-
10
expect{regexp.examples}.to raise_error RegexpExamples::UnsupportedSyntaxError
-
end
-
end
-
end
-
-
1
context 'returns matching strings' do
-
1
context "for basic repeaters" do
-
1
examples_exist_and_match(
-
/a/,
-
/a*/,
-
/a+/,
-
/a?/,
-
/a{1}/,
-
/a{1,}/,
-
/a{,2}/,
-
/a{1,2}/
-
)
-
end
-
-
1
context "for basic groups" do
-
1
examples_exist_and_match(
-
/[a]/,
-
/(a)/,
-
/a|b/,
-
/./
-
)
-
end
-
-
1
context "for complex char groups (square brackets)" do
-
1
examples_exist_and_match(
-
/[abc]/,
-
/[a-c]/,
-
/[abc-e]/,
-
/[^a-zA-Z]/,
-
/[\w]/,
-
/[]]/, # TODO: How to suppress annoying warnings on this test?
-
/[\]]/,
-
/[\\]/,
-
/[\\\]]/,
-
/[\n-\r]/,
-
/[\-]/,
-
/[%-+]/ # This regex is "supposed to" match some surprising things!!!
-
)
-
end
-
-
1
context "for complex multi groups" do
-
1
examples_exist_and_match(
-
/(normal)/,
-
/(?:nocapture)/,
-
/(?<name>namedgroup)/,
-
/(?<name>namedgroup) \k<name>/
-
)
-
end
-
-
1
context "for escaped characters" do
-
1
examples_exist_and_match(
-
/\w/,
-
/\W/,
-
/\s/,
-
/\S/,
-
/\d/,
-
/\D/,
-
/\h/,
-
/\H/,
-
/\t/,
-
/\n/,
-
/\f/,
-
/\a/,
-
/\v/,
-
/\e/
-
)
-
end
-
-
1
context "for backreferences" do
-
1
examples_exist_and_match(
-
/(repeat) \1/,
-
/(ref1) (ref2) \1 \2/,
-
/((ref2)ref1) \1 \2/,
-
/((ref1and2)) \1 \2/,
-
/(one)(two)(three)(four)(five)(six)(seven)(eight)(nine)(ten) \10\9\8\7\6\5\4\3\2\1/,
-
/(a?(b?(c?(d?(e?)))))/
-
)
-
end
-
-
1
context "for complex patterns" do
-
# Longer combinations of the above
-
1
examples_exist_and_match(
-
/https?:\/\/(www\.)github\.com/,
-
/(I(N(C(E(P(T(I(O(N)))))))))*/,
-
/[\w]{1}/,
-
/((a?b*c+)) \1/,
-
/((a?b*c+)?) \1/,
-
/a|b|c|d/,
-
/a+|b*|c?/
-
)
-
end
-
-
1
context "for illegal syntax" do
-
1
examples_raise_illegal_syntax_error(
-
/(?=lookahead)/,
-
/(?!neglookahead)/,
-
/(?<=lookbehind)/,
-
/(?<!neglookbehind)/,
-
/\bword-boundary/,
-
/no\Bn-word-boundary/,
-
/\Glast-match/,
-
/start-of\A-string/,
-
/start-of^-line/,
-
/end-of\Z-string/,
-
/end-of\z-string/,
-
/end-of$-line/
-
)
-
end
-
-
1
context "ignore start/end anchors if at start/end" do
-
1
examples_exist_and_match(
-
/\Astart/,
-
/^start/,
-
/end$/,
-
/end\z/,
-
/end\Z/
-
)
-
end
-
-
1
context "for unsupported syntax" do
-
1
examples_raise_unsupported_syntax_error(
-
/\p{L}/,
-
/\p{Arabic}/,
-
/\p{^Ll}/,
-
/(?<name> ... \g<name>*)/,
-
/[[:space:]]/
-
)
-
end
-
-
1
context "for control characters" do
-
1
examples_exist_and_match(
-
/\ca/,
-
/\cZ/,
-
/\c9/,
-
/\c[/,
-
/\c#/,
-
/\c?/,
-
/\C-a/,
-
/\C-&/
-
)
-
end
-
-
1
context "for escape sequences" do
-
1
examples_exist_and_match(
-
/\x42/,
-
/\x1D/,
-
/\x3word/,
-
/#{"\x80".force_encoding("ASCII-8BIT")}/
-
)
-
end
-
-
1
context "for unicode sequences" do
-
1
examples_exist_and_match(
-
/\u6829/,
-
/\uabcd/,
-
/\u{42}word/
-
)
-
end
-
-
end
-
end