lib/regextest/back/main.rb in regextest-0.1.5 vs lib/regextest/back/main.rb in regextest-0.1.6

- old
+ new

@@ -10,10 +10,11 @@ class Regextest::Back::Main include Regextest::Common def initialize(json_obj, max_nest, retry_count = 0) @json_obj = json_obj @max_nest = max_nest + @past_max_nest = 0 # max nest of the past @retry_count = retry_count @parens_hash = {} # hash to keep string generated by parentheses @nest = 0 # current nest of back-reference @quit_mode = false # flag for preventing from increase of nest # if true, \g<foo> is restrained if possible @@ -22,19 +23,19 @@ def generate # seek parentheses because there are references defined ahead seek_parens(@json_obj) # generate pre-result of matched string (pre-result contains candidates of letters) - pre_result = generate_candidates({json: @json_obj}) + param = {} + pre_result = generate_candidates(@json_obj, param) return nil unless pre_result - TstLog("pre_result1:\n" + pre_result.inspect) - + TstLog("pre_result1:\n" + pre_result.map{|elem| elem.inspect}.join("\n")) + # narrow down the candidates result = narrow_down_candidates(pre_result) TstLog("pre_result2:\n" + result.inspect) return nil if !result || !result.narrow_down - # fixes result result.fix result end @@ -52,36 +53,35 @@ end end end # generate pre-result of matched string (pre-result contains candidates of letters) - def generate_candidates(param) - target = param[:json] + def generate_candidates(target, param) # puts "MATCH type:#{target["type"]}" result = nil case target["type"] when "LEX_SEQ" # sequence of letters or parentheses - result = generate_candidates_seq(param) + result = generate_candidates_seq(target, param) when "LEX_SELECT" - result = generate_candidates_select(param) + result = generate_candidates_select(target, param) when "LEX_PAREN" - result = generate_candidates_paren(param) + result = generate_candidates_paren(target, param) when "LEX_CHAR_CLASS" - result = generate_candidates_char_class(param) - when "LEX_BRACKET", "LEX_SIMPLIFIED_CLASS", "LEX_ANY_LETTER", "LEX_POSIX_CHAR_CLASS", "LEX_UNICODE_CLASS" - result = generate_candidates({json: target["value"]}) + result = generate_candidates_char_class(target, param) + when "LEX_BRACKET", "LEX_SIMPLIFIED_CLASS", "LEX_ANY_LETTER", "LEX_POSIX_CHAR_CLASS", "LEX_UNICODE_CLASS", "LEX_UNICODE_CLASS_BRACKET" + result = generate_candidates(target["value"], param) when "LEX_REPEAT" - result = generate_candidates_repeat(param) + result = generate_candidates_repeat(target, param) when "LEX_RANGE" - result = generate_candidates_range(param) + result = generate_candidates_range(target, param) when "LEX_BACK_REFER", "LEX_NAMED_REFER" - result = generate_candidates_back_refer(param) + result = generate_candidates_back_refer(target, param) when "LEX_NAMED_GENERATE" - result = generate_candidates_named_generate(param) + result = generate_candidates_named_generate(target, param) when "LEX_CHAR" - result = generate_candidates_char(param) + result = generate_candidates_char(target, param) when "LEX_ANC_LINE_BEGIN" result = Regextest::Back::Element.new({cmd: :CMD_ANC_LINE_BEGIN}) when "LEX_ANC_LINE_END" result = Regextest::Back::Element.new({cmd: :CMD_ANC_LINE_END}) when "LEX_ANC_WORD_BOUND" @@ -107,15 +107,14 @@ end result end # sequence of letters or parentheses - def generate_candidates_seq(param) - target = param[:json] + def generate_candidates_seq(target, param) results = [] target["value"].each do |elem| - generated_string = generate_candidates({json: elem}) + generated_string = generate_candidates(elem, param) if(Array === generated_string) generated_string.flatten!(1) results += generated_string else results.push generated_string @@ -130,60 +129,90 @@ end result end # selection of sequence. such as (aa|b|c) - def generate_candidates_select(param) - target = param[:json] + def generate_candidates_select(target, param) if param[:forced_select] - # index is specified by condition - if target["value"][param[:forced_select]] - result = generate_candidates({json: target["value"][param[:forced_select]]}) + # index is specified by condition + offset = param[:forced_select] + param.delete :forced_select + if target["value"][offset] + result = generate_candidates(target["value"][offset], param) else # regexp such as /^(?:b|(a))(?(1)1)$/ match "b"! result = [] end else # success if there is at least one result offsets = (0 ... target["value"].size).to_a - if !param[:atomic] && offsets.size > 1 - offsets = TstShuffle(offsets) # shuffle if not atomic group (this proceduce is not sufficient...) - end + + # shuffle if element size more than 1 + offsets = TstShuffle(offsets) if offsets.size > 1 + result = nil - offsets.each do | offset | - result = generate_candidates({json: target["value"][offset]}) - break if(result) + if param[:atomic] + param.delete :atomic + # if atomic, assure proceeding results not appeared + offsets.each do | offset | + result = [] + (0...offset).each do | prev | + la_result = generate_candidates(target["value"][prev], param) + result.push Regextest::Back::Element.new({cmd: :CMD_NOT_LOOK_AHEAD, result: la_result}) + end + result.push generate_candidates(target["value"][offset], param) + break if(!result.find{|elem| !elem }) + end + elsif negative_type = param[:negative] + # if negative, assure all results not appeared + result = [] + offsets.each do | offset | + la_result = generate_candidates(target["value"][offset], param) + la_result.each do | elem | + if elem.command == :CMD_NOT_LOOK_AHEAD + result.push elem + else + result.push Regextest::Back::Element.new({cmd: negative_type, result: la_result}) + end + end + end + param.delete :negative + else + offsets.each do | offset | + result = generate_candidates(target["value"][offset], param) + break if(result) + end end end result end # parenthesis - def generate_candidates_paren(param) - target = param[:json] + def generate_candidates_paren(target, param) # analyze options of the parenthesis paren_prefix = target["prefix"] # pp target["prefix"] if(paren_prefix == "<=") - lb_result = generate_candidates({json: target["value"]}) + lb_result = generate_candidates(target["value"], param) result = Regextest::Back::Element.new({cmd: :CMD_LOOK_BEHIND, result: lb_result}) elsif(paren_prefix == "=") - la_result = generate_candidates({json: target["value"]}) + la_result = generate_candidates(target["value"], param) result = Regextest::Back::Element.new({cmd: :CMD_LOOK_AHEAD, result: la_result}) elsif(paren_prefix == "<!") - lb_result = generate_candidates({json: target["value"]}) - result = Regextest::Back::Element.new({cmd: :CMD_NOT_LOOK_BEHIND, result: lb_result}) + param[:negative] = :CMD_NOT_LOOK_BEHIND + result = generate_candidates(target["value"], param) elsif(paren_prefix == "!") - la_result = generate_candidates({json: target["value"]}) - result = Regextest::Back::Element.new({cmd: :CMD_NOT_LOOK_AHEAD, result: la_result}) + param[:negative] = :CMD_NOT_LOOK_AHEAD + result = generate_candidates(target["value"], param) elsif(paren_prefix == ">") # atomic group - generate_string = generate_candidates({json: target["value"], atomic: true}) + param[:atomic] = true + generate_string = generate_candidates(target["value"], param) @parens_hash[target["refer_name"]][:generated] ||= [] @parens_hash[target["refer_name"]][:generated][@nest] = generate_string result = generate_string elsif(paren_prefix == "") # simple parenthesis - generate_string = generate_candidates({json: target["value"]}) + generate_string = generate_candidates(target["value"], param) @parens_hash[target["refer_name"]][:generated] ||= [] @parens_hash[target["refer_name"]][:generated][@nest] = generate_string result = generate_string else # when condition is specified @@ -197,26 +226,27 @@ end if(select_num == 1 && target["value"]["type"] != "LEX_SELECT") result = nil else - generate_string = generate_candidates({json: target["value"], forced_select: select_num}) + param[:forced_select] = select_num + generate_string = generate_candidates(target["value"], param) @parens_hash[target["refer_name"]][:generated] ||= [] @parens_hash[target["refer_name"]][:generated][@nest] = generate_string result = generate_string end end result end # char class - def generate_candidates_char_class(param) - target = param[:json] - results = Regextest::Back::Element.new({cmd: :CMD_SELECT, data: []}) + def generate_candidates_char_class(target, param) + charset = target["charset"] + results = Regextest::Back::Element.new({cmd: :CMD_SELECT, ranges: [], charset: charset}) target["value"].each do | elem | - if sub_results = generate_candidates({json: elem}) + if sub_results = generate_candidates(elem, param) results.union sub_results end end if results.size > 0 result = results @@ -225,33 +255,35 @@ end result end # repeat - def generate_candidates_repeat(param) - target = param[:json] + def generate_candidates_repeat(target, param) max_repeat = target["max_repeat"] min_repeat = target["min_repeat"] - if @retry_count > 0 + # reduce repeat count if retry and there are one or more \g<foo> calls + if @retry_count > 0 && @past_max_nest > 0 @retry_count.times{ max_repeat = (max_repeat + 1)/2 } end if(@quit_mode) repeat = min_repeat elsif(max_repeat > min_repeat) repeat = min_repeat + TstRand(max_repeat - min_repeat + 1) else repeat = min_repeat end + result = [] if target["repeat_option"].index("reluctant") result.push Regextest::Back::Element.new({cmd: :CMD_ANC_RELUCTANT_BEGIN, id: target["id"]}) end + # puts "repeat=#{repeat} quit=#{@quit_mode} nest=#{@nest}" repeat.times do - if( elem = generate_candidates({json: target["value"]})) + if( elem = generate_candidates(target["value"], param)) result.push elem else result = nil break end @@ -261,69 +293,87 @@ if elem.size > 0 && elem[0].respond_to?(:command) && elem[-1].respond_to?(:command) break if elem[0].command == :CMD_ANC_LINE_BEGIN && !elem[-1].new_line? break if elem[0].command == :CMD_ANC_STRING_BEGIN end end + if target["repeat_option"].index("reluctant") result.push Regextest::Back::Element.new({cmd: :CMD_ANC_RELUCTANT_END, id: target["id"]}) end + + if target["repeat_option"].index("possessive") + la_result = [ generate_candidates(target["value"], param) ].flatten + result.push Regextest::Back::Element.new({cmd: :CMD_NOT_LOOK_AHEAD, result: la_result}) + end result end # range - def generate_candidates_range(param) - target = param[:json] + def generate_candidates_range(target, param) + charset = target["charset"] letter = [] - codepoints = (target["begin"]..target["end"]).to_a - result = Regextest::Back::Element.new({cmd: :CMD_SELECT, data: codepoints}) + codepoints = (target["begin"]..target["end"]) + result = Regextest::Back::Element.new({cmd: :CMD_SELECT, ranges: [codepoints], charset: charset}) end # back_refer - def generate_candidates_back_refer(param) - target = param[:json] + def generate_candidates_back_refer(target, param) if @parens_hash[target["refer_name"]][:generated] relative_num = -1 # default value if target["relative_num"] != "" work = @nest + target["relative_num"].to_i return nil if(work < 0 || !@parens_hash[target["refer_name"]][:generated][work]) relative_num = work end # puts "relative: #{relative_num}, nest=#{@nest}, :#{target}" result = @parens_hash[target["refer_name"]][:generated][relative_num] + + # Somehow /(^a)\1/ must match with "aa" + if result.size > 0 && + (result[0].command == :CMD_ANC_LINE_BEGIN || + result[0].command == :CMD_ANC_STRING_BEGIN) + result = result[1..-1] # ignore first anchor + end else result = nil end result end # named generate - def generate_candidates_named_generate(param) - target = param[:json] + def generate_candidates_named_generate(target, param) @quit_mode = true if(@nest >= @max_nest) if(@quit_mode) result = nil else @nest += 1 + @past_max_nest = @nest if @nest > @past_max_nest if target["refer_name"] == "$$_0" # recursively call whole expression - result = generate_candidates({json: @json_obj}) + result = generate_candidates(@json_obj, param) else - result = generate_candidates({json: @parens_hash[target["refer_name"]][:target]}) + result = generate_candidates(@parens_hash[target["refer_name"]][:target], param) end @nest -= 1 end result end # char - def generate_candidates_char(param) - target = param[:json] + def generate_candidates_char(target, param) + charset = target["charset"] case target["value"] when String codepoint = target["value"].unpack("U*")[0] - result = Regextest::Back::Element.new({cmd: :CMD_SELECT, data: [codepoint]}) + result = Regextest::Back::Element.new( + { + cmd: :CMD_SELECT, + ranges: [codepoint..codepoint], + charset: charset + } + ) else - result = generate_candidates({json: target["value"]}) + result = generate_candidates(target["value"], param) end result end # narrow down candidates considering anchors @@ -349,10 +399,11 @@ end when :CMD_ANC_LINE_BEGIN, :CMD_ANC_LINE_END, :CMD_ANC_WORD_BOUND, :CMD_ANC_WORD_UNBOUND, :CMD_ANC_STRING_BEGIN, :CMD_ANC_STRING_END, :CMD_ANC_STRING_END2, :CMD_ANC_MATCH_START, :CMD_ANC_LOOK_BEHIND2 results.add_anchor(command) - when :CMD_ANC_RELUCTANT_BEGIN, :CMD_ANC_RELUCTANT_END + when :CMD_ANC_RELUCTANT_BEGIN, :CMD_ANC_RELUCTANT_END, + :CMD_ANC_POSSESSIVE_BEGIN, :CMD_ANC_POSSESSIVE_END results.add_reluctant_repeat(elem) else raise "inner error, invalid command at checking anchors: #{command}" end end