# encoding: utf-8 # Copyright (C) 2016 Mikio Ikoma require 'regextest/common' require 'regextest/back/element' require 'regextest/back/result' # Main class of back-end. Construct candidates array and narrow down class Regextest::Back::Main include Regextest::Common def initialize(json_obj, max_nest, retry_count = 0) @json_obj = json_obj @max_nest = max_nest @past_max_nest = 0 # max nest of the past @retry_count = retry_count @parens_hash = {} # hash to keep string generated by parentheses @nest = 0 # current nest of back-reference @quit_mode = false # flag for preventing from increase of nest # if true, \g is restrained if possible end def generate # seek parentheses because there are references defined ahead seek_parens(@json_obj) # generate pre-result of matched string (pre-result contains candidates of letters) param = {} pre_result = generate_candidates(@json_obj, param) return nil unless pre_result TstLog("pre_result1:\n" + pre_result.map{|elem| elem.inspect}.join("\n")) # narrow down the candidates result = narrow_down_candidates(pre_result) TstLog("pre_result2:\n" + result.inspect) return nil if !result || !result.narrow_down # fixes result result.fix result end # seek parentheses def seek_parens(target) if(target["type"] == "LEX_PAREN") @parens_hash[target["refer_name"]] = {:target => target} end if(target["value"]) if( Array === target["value"]) target["value"].each{|child| seek_parens(child)} else seek_parens(target["value"]) end end end # generate pre-result of matched string (pre-result contains candidates of letters) def generate_candidates(target, param) # puts "MATCH type:#{target["type"]}" result = nil case target["type"] when "LEX_SEQ" # sequence of letters or parentheses result = generate_candidates_seq(target, param) when "LEX_SELECT" result = generate_candidates_select(target, param) when "LEX_PAREN" result = generate_candidates_paren(target, param) when "LEX_CHAR_CLASS" result = generate_candidates_char_class(target, param) when "LEX_BRACKET", "LEX_SIMPLIFIED_CLASS", "LEX_ANY_LETTER", "LEX_POSIX_CHAR_CLASS", "LEX_UNICODE_CLASS", "LEX_UNICODE_CLASS_BRACKET" result = generate_candidates(target["value"], param) when "LEX_REPEAT" result = generate_candidates_repeat(target, param) when "LEX_RANGE" result = generate_candidates_range(target, param) when "LEX_BACK_REFER", "LEX_NAMED_REFER" result = generate_candidates_back_refer(target, param) when "LEX_NAMED_GENERATE" result = generate_candidates_named_generate(target, param) when "LEX_CHAR" result = generate_candidates_char(target, param) when "LEX_ANC_LINE_BEGIN" result = Regextest::Back::Element.new({cmd: :CMD_ANC_LINE_BEGIN}) when "LEX_ANC_LINE_END" result = Regextest::Back::Element.new({cmd: :CMD_ANC_LINE_END}) when "LEX_ANC_WORD_BOUND" result = Regextest::Back::Element.new({cmd: :CMD_ANC_WORD_BOUND}) when "LEX_ANC_WORD_UNBOUND" result = Regextest::Back::Element.new({cmd: :CMD_ANC_WORD_UNBOUND}) when "LEX_ANC_STRING_BEGIN" result = Regextest::Back::Element.new({cmd: :CMD_ANC_STRING_BEGIN}) when "LEX_ANC_STRING_END" result = Regextest::Back::Element.new({cmd: :CMD_ANC_STRING_END}) when "LEX_ANC_STRING_END2" result = Regextest::Back::Element.new({cmd: :CMD_ANC_STRING_END2}) when "LEX_ANC_MATCH_START" result = Regextest::Back::Element.new({cmd: :CMD_ANC_MATCH_START}) when "LEX_ANC_LOOK_BEHIND2" result = Regextest::Back::Element.new({cmd: :CMD_ANC_LOOK_BEHIND2}) when "LEX_OPTION_PAREN" # options are processed at front-end result = [] when "LEX_EMPTY" result = [] else raise "#{target["type"]} not implemented (from generate_candidates routine)" end result end # sequence of letters or parentheses def generate_candidates_seq(target, param) results = [] target["value"].each do |elem| generated_string = generate_candidates(elem, param) if(Array === generated_string) generated_string.flatten!(1) results += generated_string else results.push generated_string end end # nil if one element failed if(results.index(nil)) result = nil else # result = results.join("") result = results end result end # selection of sequence. such as (aa|b|c) def generate_candidates_select(target, param) if param[:forced_select] # index is specified by condition offset = param[:forced_select] param.delete :forced_select if target["value"][offset] result = generate_candidates(target["value"][offset], param) else # regexp such as /^(?:b|(a))(?(1)1)$/ match "b"! result = [] end else # success if there is at least one result offsets = (0 ... target["value"].size).to_a # shuffle if element size more than 1 offsets = TstShuffle(offsets) if offsets.size > 1 result = nil if param[:atomic] param.delete :atomic # if atomic, assure proceeding results not appeared offsets.each do | offset | result = [] (0...offset).each do | prev | la_result = generate_candidates(target["value"][prev], param) result.push Regextest::Back::Element.new({cmd: :CMD_NOT_LOOK_AHEAD, result: la_result}) end result.push generate_candidates(target["value"][offset], param) break if(!result.find{|elem| !elem }) end elsif negative_type = param[:negative] # if negative, assure all results not appeared result = [] offsets.each do | offset | la_result = generate_candidates(target["value"][offset], param) la_result.each do | elem | if elem.command == :CMD_NOT_LOOK_AHEAD result.push elem else result.push Regextest::Back::Element.new({cmd: negative_type, result: la_result}) end end end param.delete :negative else offsets.each do | offset | result = generate_candidates(target["value"][offset], param) break if(result) end end end result end # parenthesis def generate_candidates_paren(target, param) # analyze options of the parenthesis paren_prefix = target["prefix"] # pp target["prefix"] if(paren_prefix == "<=") lb_result = generate_candidates(target["value"], param) result = Regextest::Back::Element.new({cmd: :CMD_LOOK_BEHIND, result: lb_result}) elsif(paren_prefix == "=") la_result = generate_candidates(target["value"], param) result = Regextest::Back::Element.new({cmd: :CMD_LOOK_AHEAD, result: la_result}) elsif(paren_prefix == "") # atomic group param[:atomic] = true generate_string = generate_candidates(target["value"], param) @parens_hash[target["refer_name"]][:generated] ||= [] @parens_hash[target["refer_name"]][:generated][@nest] = generate_string result = generate_string elsif(paren_prefix == "") # simple parenthesis generate_string = generate_candidates(target["value"], param) @parens_hash[target["refer_name"]][:generated] ||= [] @parens_hash[target["refer_name"]][:generated][@nest] = generate_string result = generate_string else # when condition is specified select_num = nil if(target["condition_name"] && target["condition_name"].length > 0) if @parens_hash[target["condition_name"]][:generated] select_num = 0 else select_num = 1 end end if(select_num == 1 && target["value"]["type"] != "LEX_SELECT") result = nil else param[:forced_select] = select_num generate_string = generate_candidates(target["value"], param) @parens_hash[target["refer_name"]][:generated] ||= [] @parens_hash[target["refer_name"]][:generated][@nest] = generate_string result = generate_string end end result end # char class def generate_candidates_char_class(target, param) charset = target["charset"] results = Regextest::Back::Element.new({cmd: :CMD_SELECT, ranges: [], charset: charset}) target["value"].each do | elem | if sub_results = generate_candidates(elem, param) results.union sub_results end end if results.size > 0 result = results else result = nil end result end # repeat def generate_candidates_repeat(target, param) max_repeat = target["max_repeat"] min_repeat = target["min_repeat"] # reduce repeat count if retry and there are one or more \g calls if @retry_count > 0 && @past_max_nest > 0 @retry_count.times{ max_repeat = (max_repeat + 1)/2 } end if(@quit_mode) repeat = min_repeat elsif(max_repeat > min_repeat) repeat = min_repeat + TstRand(max_repeat - min_repeat + 1) else repeat = min_repeat end result = [] if target["repeat_option"].index("reluctant") result.push Regextest::Back::Element.new({cmd: :CMD_ANC_RELUCTANT_BEGIN, id: target["id"]}) end # puts "repeat=#{repeat} quit=#{@quit_mode} nest=#{@nest}" repeat.times do | current_repeat | if( elem = generate_candidates(target["value"], param)) result.push elem else result = nil break end elem.flatten! if Array === elem # flatten considering duplicated repeat # quit to repeat if the first element is begin anchor if current_repeat >= min_repeat && elem.size > 0 && elem[0].respond_to?(:command) && elem[-1].respond_to?(:command) break if elem[0].command == :CMD_ANC_LINE_BEGIN && !elem[-1].new_line? break if elem[0].command == :CMD_ANC_STRING_BEGIN end end if target["repeat_option"].index("reluctant") result.push Regextest::Back::Element.new({cmd: :CMD_ANC_RELUCTANT_END, id: target["id"]}) end if target["repeat_option"].index("possessive") la_result = [ generate_candidates(target["value"], param) ].flatten result.push Regextest::Back::Element.new({cmd: :CMD_NOT_LOOK_AHEAD, result: la_result}) end result end # range def generate_candidates_range(target, param) charset = target["charset"] letter = [] codepoints = (target["begin"]..target["end"]) result = Regextest::Back::Element.new({cmd: :CMD_SELECT, ranges: [codepoints], charset: charset}) end # back_refer def generate_candidates_back_refer(target, param) if @parens_hash[target["refer_name"]][:generated] relative_num = -1 # default value if target["relative_num"] != "" work = @nest + target["relative_num"].to_i return nil if(work < 0 || !@parens_hash[target["refer_name"]][:generated][work]) relative_num = work end # puts "relative: #{relative_num}, nest=#{@nest}, :#{target}" result = @parens_hash[target["refer_name"]][:generated][relative_num] # Somehow /(^a)\1/ or /(\ba)\1/ must match with "aa" if result.size > 0 result = result.select{|elem| elem.command == :CMD_SELECT} end else result = nil end result end # named generate def generate_candidates_named_generate(target, param) @quit_mode = true if(@nest >= @max_nest) if(@quit_mode) result = nil else @nest += 1 @past_max_nest = @nest if @nest > @past_max_nest if target["refer_name"] == "$$_0" # recursively call whole expression result = generate_candidates(@json_obj, param) else result = generate_candidates(@parens_hash[target["refer_name"]][:target], param) end @nest -= 1 end result end # char def generate_candidates_char(target, param) charset = target["charset"] case target["value"] when String codepoint = target["value"].unpack("U*")[0] result = Regextest::Back::Element.new( { cmd: :CMD_SELECT, ranges: [codepoint..codepoint], charset: charset } ) else result = generate_candidates(target["value"], param) end result end # narrow down candidates considering anchors def narrow_down_candidates(candidate_array) # pp candidate_array results = Regextest::Back::Result.new candidate_array.each do | elem | command = elem.command case command when :CMD_SELECT results.push_body elem when :CMD_LOOK_AHEAD, :CMD_NOT_LOOK_AHEAD if(sub_results = narrow_down_candidates(elem.param[:result])) results.add_look_ahead(command, sub_results) else return nil end when :CMD_LOOK_BEHIND, :CMD_NOT_LOOK_BEHIND if(sub_results = narrow_down_candidates(elem.param[:result])) results.add_look_behind(command, sub_results) else return nil end when :CMD_ANC_LINE_BEGIN, :CMD_ANC_LINE_END, :CMD_ANC_WORD_BOUND, :CMD_ANC_WORD_UNBOUND, :CMD_ANC_STRING_BEGIN, :CMD_ANC_STRING_END, :CMD_ANC_STRING_END2, :CMD_ANC_MATCH_START, :CMD_ANC_LOOK_BEHIND2 results.add_anchor(command) when :CMD_ANC_RELUCTANT_BEGIN, :CMD_ANC_RELUCTANT_END, :CMD_ANC_POSSESSIVE_BEGIN, :CMD_ANC_POSSESSIVE_END results.add_reluctant_repeat(elem) else raise "inner error, invalid command at checking anchors: #{command}" end end if !results.merge return nil end results end end # Test suite (execute when this file is specified in command line) if __FILE__ == $0 end