lib/regextest/back/main.rb in regextest-0.1.5 vs lib/regextest/back/main.rb in regextest-0.1.6
- old
+ new
@@ -10,10 +10,11 @@
class Regextest::Back::Main
include Regextest::Common
def initialize(json_obj, max_nest, retry_count = 0)
@json_obj = json_obj
@max_nest = max_nest
+ @past_max_nest = 0 # max nest of the past
@retry_count = retry_count
@parens_hash = {} # hash to keep string generated by parentheses
@nest = 0 # current nest of back-reference
@quit_mode = false # flag for preventing from increase of nest
# if true, \g<foo> is restrained if possible
@@ -22,19 +23,19 @@
# Entry point of the back end: produces one result for @json_obj.
# Steps: register parentheses, generate candidates, narrow them down, fix.
# Returns nil when candidate generation yields nothing, when narrowing
# returns nothing, or when result.narrow_down is falsy; otherwise returns
# the narrowed result after calling fix on it.
def generate
# seek parentheses because there are references defined ahead
seek_parens(@json_obj)
# generate pre-result of matched string (pre-result contains candidates of letters)
- pre_result = generate_candidates({json: @json_obj})
# new side: the parsed tree and an (initially empty) option hash are passed separately
+ param = {}
+ pre_result = generate_candidates(@json_obj, param)
return nil unless pre_result
- TstLog("pre_result1:\n" + pre_result.inspect)
-
# new side: log one inspected element per line instead of a single inspect of the whole pre-result
+ TstLog("pre_result1:\n" + pre_result.map{|elem| elem.inspect}.join("\n"))
+
# narrow down the candidates
result = narrow_down_candidates(pre_result)
TstLog("pre_result2:\n" + result.inspect)
# give up when there is no narrowed result or its narrow_down call reports failure
return nil if !result || !result.narrow_down
-
# fixes result
result.fix
result
end
@@ -52,36 +53,35 @@
end
end
end
# generate pre-result of matched string (pre-result contains candidates of letters)
- def generate_candidates(param)
- target = param[:json]
+ def generate_candidates(target, param)
# puts "MATCH type:#{target["type"]}"
result = nil
case target["type"]
when "LEX_SEQ" # sequence of letters or parentheses
- result = generate_candidates_seq(param)
+ result = generate_candidates_seq(target, param)
when "LEX_SELECT"
- result = generate_candidates_select(param)
+ result = generate_candidates_select(target, param)
when "LEX_PAREN"
- result = generate_candidates_paren(param)
+ result = generate_candidates_paren(target, param)
when "LEX_CHAR_CLASS"
- result = generate_candidates_char_class(param)
- when "LEX_BRACKET", "LEX_SIMPLIFIED_CLASS", "LEX_ANY_LETTER", "LEX_POSIX_CHAR_CLASS", "LEX_UNICODE_CLASS"
- result = generate_candidates({json: target["value"]})
+ result = generate_candidates_char_class(target, param)
+ when "LEX_BRACKET", "LEX_SIMPLIFIED_CLASS", "LEX_ANY_LETTER", "LEX_POSIX_CHAR_CLASS", "LEX_UNICODE_CLASS", "LEX_UNICODE_CLASS_BRACKET"
+ result = generate_candidates(target["value"], param)
when "LEX_REPEAT"
- result = generate_candidates_repeat(param)
+ result = generate_candidates_repeat(target, param)
when "LEX_RANGE"
- result = generate_candidates_range(param)
+ result = generate_candidates_range(target, param)
when "LEX_BACK_REFER", "LEX_NAMED_REFER"
- result = generate_candidates_back_refer(param)
+ result = generate_candidates_back_refer(target, param)
when "LEX_NAMED_GENERATE"
- result = generate_candidates_named_generate(param)
+ result = generate_candidates_named_generate(target, param)
when "LEX_CHAR"
- result = generate_candidates_char(param)
+ result = generate_candidates_char(target, param)
when "LEX_ANC_LINE_BEGIN"
result = Regextest::Back::Element.new({cmd: :CMD_ANC_LINE_BEGIN})
when "LEX_ANC_LINE_END"
result = Regextest::Back::Element.new({cmd: :CMD_ANC_LINE_END})
when "LEX_ANC_WORD_BOUND"
@@ -107,15 +107,14 @@
end
result
end
# sequence of letters or parentheses
- def generate_candidates_seq(param)
- target = param[:json]
+ def generate_candidates_seq(target, param)
results = []
target["value"].each do |elem|
- generated_string = generate_candidates({json: elem})
+ generated_string = generate_candidates(elem, param)
if(Array === generated_string)
generated_string.flatten!(1)
results += generated_string
else
results.push generated_string
@@ -130,60 +129,90 @@
end
result
end
# selection of sequence. such as (aa|b|c)
# Chooses among the alternatives held in target["value"] and returns the
# generated candidates (nil when the plain branch finds no alternative that
# generates a truthy result).
# The notes below describe the "+" (new) side. Keys of param that are read:
#   :forced_select - index of the alternative to generate
#   :atomic        - truthy: each tried alternative is preceded by
#                    CMD_NOT_LOOK_AHEAD elements for the lower-indexed ones
#   :negative      - command symbol used to wrap every alternative
# Each of these keys is deleted from param by the branch that consumes it.
- def generate_candidates_select(param)
- target = param[:json]
+ def generate_candidates_select(target, param)
if param[:forced_select]
- # index is specified by condition
- if target["value"][param[:forced_select]]
- result = generate_candidates({json: target["value"][param[:forced_select]]})
+ # index is specified by condition
# the key is removed before recursing, so the recursive call does not see it
+ offset = param[:forced_select]
+ param.delete :forced_select
+ if target["value"][offset]
+ result = generate_candidates(target["value"][offset], param)
else
# regexp such as /^(?:b|(a))(?(1)1)$/ match "b"!
result = []
end
else
# success if there is at least one result
offsets = (0 ... target["value"].size).to_a
- if !param[:atomic] && offsets.size > 1
- offsets = TstShuffle(offsets) # shuffle if not atomic group (this proceduce is not sufficient...)
- end
+
+ # shuffle if element size more than 1
+ offsets = TstShuffle(offsets) if offsets.size > 1
+
result = nil
- offsets.each do | offset |
- result = generate_candidates({json: target["value"][offset]})
- break if(result)
+ if param[:atomic]
+ param.delete :atomic
+ # if atomic, assure proceeding results not appeared
# for each tried offset: one CMD_NOT_LOOK_AHEAD element per alternative with a
# lower index, followed by the candidates of the alternative at that offset
+ offsets.each do | offset |
+ result = []
+ (0...offset).each do | prev |
+ la_result = generate_candidates(target["value"][prev], param)
+ result.push Regextest::Back::Element.new({cmd: :CMD_NOT_LOOK_AHEAD, result: la_result})
+ end
+ result.push generate_candidates(target["value"][offset], param)
# stop at the first list that contains no nil/false entry
+ break if(!result.find{|elem| !elem })
+ end
+ elsif negative_type = param[:negative]
+ # if negative, assure all results not appeared
# :negative is still present in param while the alternatives are generated;
# it is deleted only after the loop
# NOTE(review): la_result.each assumes generate_candidates returned a non-nil
# object responding to each - confirm
+ result = []
+ offsets.each do | offset |
+ la_result = generate_candidates(target["value"][offset], param)
+ la_result.each do | elem |
# elements that are already CMD_NOT_LOOK_AHEAD are kept as they are; for any other
# element a new element wrapping the whole la_result is pushed
+ if elem.command == :CMD_NOT_LOOK_AHEAD
+ result.push elem
+ else
+ result.push Regextest::Back::Element.new({cmd: negative_type, result: la_result})
+ end
+ end
+ end
+ param.delete :negative
+ else
# plain selection: take the first alternative (in shuffled order) that generates a truthy result
+ offsets.each do | offset |
+ result = generate_candidates(target["value"][offset], param)
+ break if(result)
+ end
end
end
result
end
# parenthesis
- def generate_candidates_paren(param)
- target = param[:json]
+ def generate_candidates_paren(target, param)
# analyze options of the parenthesis
paren_prefix = target["prefix"]
# pp target["prefix"]
if(paren_prefix == "<=")
- lb_result = generate_candidates({json: target["value"]})
+ lb_result = generate_candidates(target["value"], param)
result = Regextest::Back::Element.new({cmd: :CMD_LOOK_BEHIND, result: lb_result})
elsif(paren_prefix == "=")
- la_result = generate_candidates({json: target["value"]})
+ la_result = generate_candidates(target["value"], param)
result = Regextest::Back::Element.new({cmd: :CMD_LOOK_AHEAD, result: la_result})
elsif(paren_prefix == "<!")
- lb_result = generate_candidates({json: target["value"]})
- result = Regextest::Back::Element.new({cmd: :CMD_NOT_LOOK_BEHIND, result: lb_result})
+ param[:negative] = :CMD_NOT_LOOK_BEHIND
+ result = generate_candidates(target["value"], param)
elsif(paren_prefix == "!")
- la_result = generate_candidates({json: target["value"]})
- result = Regextest::Back::Element.new({cmd: :CMD_NOT_LOOK_AHEAD, result: la_result})
+ param[:negative] = :CMD_NOT_LOOK_AHEAD
+ result = generate_candidates(target["value"], param)
elsif(paren_prefix == ">") # atomic group
- generate_string = generate_candidates({json: target["value"], atomic: true})
+ param[:atomic] = true
+ generate_string = generate_candidates(target["value"], param)
@parens_hash[target["refer_name"]][:generated] ||= []
@parens_hash[target["refer_name"]][:generated][@nest] = generate_string
result = generate_string
elsif(paren_prefix == "") # simple parenthesis
- generate_string = generate_candidates({json: target["value"]})
+ generate_string = generate_candidates(target["value"], param)
@parens_hash[target["refer_name"]][:generated] ||= []
@parens_hash[target["refer_name"]][:generated][@nest] = generate_string
result = generate_string
else
# when condition is specified
@@ -197,26 +226,27 @@
end
if(select_num == 1 && target["value"]["type"] != "LEX_SELECT")
result = nil
else
- generate_string = generate_candidates({json: target["value"], forced_select: select_num})
+ param[:forced_select] = select_num
+ generate_string = generate_candidates(target["value"], param)
@parens_hash[target["refer_name"]][:generated] ||= []
@parens_hash[target["refer_name"]][:generated][@nest] = generate_string
result = generate_string
end
end
result
end
# char class
- def generate_candidates_char_class(param)
- target = param[:json]
- results = Regextest::Back::Element.new({cmd: :CMD_SELECT, data: []})
+ def generate_candidates_char_class(target, param)
+ charset = target["charset"]
+ results = Regextest::Back::Element.new({cmd: :CMD_SELECT, ranges: [], charset: charset})
target["value"].each do | elem |
- if sub_results = generate_candidates({json: elem})
+ if sub_results = generate_candidates(elem, param)
results.union sub_results
end
end
if results.size > 0
result = results
@@ -225,33 +255,35 @@
end
result
end
# repeat
- def generate_candidates_repeat(param)
- target = param[:json]
+ def generate_candidates_repeat(target, param)
max_repeat = target["max_repeat"]
min_repeat = target["min_repeat"]
- if @retry_count > 0
+ # reduce repeat count if retry and there are one or more \g<foo> calls
+ if @retry_count > 0 && @past_max_nest > 0
@retry_count.times{ max_repeat = (max_repeat + 1)/2 }
end
if(@quit_mode)
repeat = min_repeat
elsif(max_repeat > min_repeat)
repeat = min_repeat + TstRand(max_repeat - min_repeat + 1)
else
repeat = min_repeat
end
+
result = []
if target["repeat_option"].index("reluctant")
result.push Regextest::Back::Element.new({cmd: :CMD_ANC_RELUCTANT_BEGIN, id: target["id"]})
end
+
# puts "repeat=#{repeat} quit=#{@quit_mode} nest=#{@nest}"
repeat.times do
- if( elem = generate_candidates({json: target["value"]}))
+ if( elem = generate_candidates(target["value"], param))
result.push elem
else
result = nil
break
end
@@ -261,69 +293,87 @@
if elem.size > 0 && elem[0].respond_to?(:command) && elem[-1].respond_to?(:command)
break if elem[0].command == :CMD_ANC_LINE_BEGIN && !elem[-1].new_line?
break if elem[0].command == :CMD_ANC_STRING_BEGIN
end
end
+
if target["repeat_option"].index("reluctant")
result.push Regextest::Back::Element.new({cmd: :CMD_ANC_RELUCTANT_END, id: target["id"]})
end
+
+ if target["repeat_option"].index("possessive")
+ la_result = [ generate_candidates(target["value"], param) ].flatten
+ result.push Regextest::Back::Element.new({cmd: :CMD_NOT_LOOK_AHEAD, result: la_result})
+ end
result
end
# range
# Builds a CMD_SELECT element for the code points target["begin"]..target["end"].
# Old side ("-"): the range is expanded to an Array and passed as data:.
# New side ("+"): the Range itself is passed inside ranges:, together with
# target["charset"].
# param is not read here. The element is returned as the value of the final
# assignment.
- def generate_candidates_range(param)
- target = param[:json]
+ def generate_candidates_range(target, param)
+ charset = target["charset"]
# NOTE(review): letter is assigned but never read in this method
letter = []
- codepoints = (target["begin"]..target["end"]).to_a
- result = Regextest::Back::Element.new({cmd: :CMD_SELECT, data: codepoints})
+ codepoints = (target["begin"]..target["end"])
+ result = Regextest::Back::Element.new({cmd: :CMD_SELECT, ranges: [codepoints], charset: charset})
end
# back_refer
# Returns the candidates previously stored under
# @parens_hash[target["refer_name"]][:generated], or nil when nothing has been
# stored for that name or the requested relative nest level is unavailable.
# param is not read here.
- def generate_candidates_back_refer(param)
- target = param[:json]
+ def generate_candidates_back_refer(target, param)
if @parens_hash[target["refer_name"]][:generated]
# index -1 picks the last entry of the :generated list
relative_num = -1 # default value
if target["relative_num"] != ""
# a relative number is added to the current nest level; give up when the
# resulting level is negative or has no stored entry
work = @nest + target["relative_num"].to_i
return nil if(work < 0 || !@parens_hash[target["refer_name"]][:generated][work])
relative_num = work
end
# puts "relative: #{relative_num}, nest=#{@nest}, :#{target}"
result = @parens_hash[target["refer_name"]][:generated][relative_num]
+
+ # Somehow /(^a)\1/ must match with "aa"
# new side: a leading line-begin or string-begin anchor of the referenced
# candidates is skipped
# NOTE(review): result.size / result[0].command assume result is non-nil and that
# its first entry responds to command - confirm
+ if result.size > 0 &&
+ (result[0].command == :CMD_ANC_LINE_BEGIN ||
+ result[0].command == :CMD_ANC_STRING_BEGIN)
+ result = result[1..-1] # ignore first anchor
+ end
else
result = nil
end
result
end
# named generate
# Generates candidates for the target referenced by target["refer_name"];
# the name "$$_0" stands for the whole expression (@json_obj).
# Once @nest has reached @max_nest, @quit_mode is switched on and nil is
# returned; @quit_mode is not reset in this method.
- def generate_candidates_named_generate(param)
- target = param[:json]
+ def generate_candidates_named_generate(target, param)
@quit_mode = true if(@nest >= @max_nest)
if(@quit_mode)
result = nil
else
# @nest is raised for the duration of the recursive generation and lowered afterwards
@nest += 1
# new side: remember the deepest nest level reached so far
+ @past_max_nest = @nest if @nest > @past_max_nest
if target["refer_name"] == "$$_0" # recursively call whole expression
- result = generate_candidates({json: @json_obj})
+ result = generate_candidates(@json_obj, param)
else
- result = generate_candidates({json: @parens_hash[target["refer_name"]][:target]})
+ result = generate_candidates(@parens_hash[target["refer_name"]][:target], param)
end
@nest -= 1
end
result
end
# char
# When target["value"] is a String, returns a CMD_SELECT element for its
# first code point; any other value is passed on to generate_candidates.
# Old side ("-"): the code point is passed as a one-element data: array.
# New side ("+"): it is passed as a single-value Range inside ranges:,
# together with target["charset"].
- def generate_candidates_char(param)
- target = param[:json]
+ def generate_candidates_char(target, param)
+ charset = target["charset"]
case target["value"]
when String
# unpack("U*") decodes the string as UTF-8 code points; only the first one is used
codepoint = target["value"].unpack("U*")[0]
- result = Regextest::Back::Element.new({cmd: :CMD_SELECT, data: [codepoint]})
+ result = Regextest::Back::Element.new(
+ {
+ cmd: :CMD_SELECT,
+ ranges: [codepoint..codepoint],
+ charset: charset
+ }
+ )
else
- result = generate_candidates({json: target["value"]})
+ result = generate_candidates(target["value"], param)
end
result
end
# narrow down candidates considering anchors
@@ -349,10 +399,11 @@
end
when :CMD_ANC_LINE_BEGIN, :CMD_ANC_LINE_END, :CMD_ANC_WORD_BOUND, :CMD_ANC_WORD_UNBOUND,
:CMD_ANC_STRING_BEGIN, :CMD_ANC_STRING_END, :CMD_ANC_STRING_END2, :CMD_ANC_MATCH_START,
:CMD_ANC_LOOK_BEHIND2
results.add_anchor(command)
- when :CMD_ANC_RELUCTANT_BEGIN, :CMD_ANC_RELUCTANT_END
+ when :CMD_ANC_RELUCTANT_BEGIN, :CMD_ANC_RELUCTANT_END,
+ :CMD_ANC_POSSESSIVE_BEGIN, :CMD_ANC_POSSESSIVE_END
results.add_reluctant_repeat(elem)
else
raise "inner error, invalid command at checking anchors: #{command}"
end
end