# encoding: utf-8

# Boogex converts a Lucene-style boolean search string (using ` AND `, ` OR `,
# ` NOT ` and round brackets) into regex source strings.
#
#   Boogex.convert('a OR b')        # => { inclusive_regex: "(?:a|b)" }
#   Boogex.convert('a NOT b HTTP')  # => { inclusive_regex: "(?:a)",
#                                   #      exclusive_regex: "(?:b HTTP)",
#                                   #      no_links: true }
module Boogex
  AND_REGEX = / AND /
  OR_REGEX = / OR /
  NOT_REGEX = / NOT /

  # Converts a boolean search string into a hash of regex source strings.
  #
  # text - String; may contain at most one ` NOT ` separator. Everything
  #        before it becomes the inclusive regex, everything after it the
  #        exclusive regex.
  #
  # Returns a Hash with :inclusive_regex, plus :exclusive_regex when a NOT
  # clause is present, plus `no_links: true` when the NOT clause contains the
  # literal 'HTTP'.
  # Raises RuntimeError when the text splits into more than two parts on NOT,
  # when brackets are unbalanced, or when a produced regex matches a single
  # space. Raises RegexpError when a produced regex is not valid.
  def self.convert(text)
    texts = text.split(NOT_REGEX)
    fail "The regex '#{text}' split more than twice on 'NOT'" if texts.size > 2

    # An empty text (or one consisting only of " NOT ") splits into [], so
    # texts[0] is nil. Treat that as an empty inclusive clause so it is
    # rejected by validate_regex_syntax! instead of blowing up on nil.
    inclu_text = texts[0].to_s
    exclu_text = texts[1]

    regex_hash = { inclusive_regex: run_through_convertors(inclu_text) }

    unless exclu_text.nil?
      regex_hash[:exclusive_regex] = run_through_convertors(exclu_text)
      regex_hash[:no_links] = true if exclu_text.include?('HTTP')
      validate_regex_syntax!(regex_hash[:exclusive_regex], text)
    end

    validate_regex_syntax!(regex_hash[:inclusive_regex], text)
    regex_hash
  end

  # Fails when `regex` matches a single space character.
  #
  # regex - String regex source (String#match compiles it, so an invalid
  #         pattern raises RegexpError with a description of the problem).
  # text  - the original search string; currently unused.
  def self.validate_regex_syntax!(regex, text)
    fail "#{regex} matched on nothing or empty space. Huh?" if !' '.match(regex).nil?
  end

  # Runs one clause through the full pipeline:
  # bracket structure -> OR/quote/hyphen rewriting -> regex formatting -> string.
  def self.run_through_convertors(text)
    array = array_struct(text)
    array = ors_to_pipes(array)
    array = regex_formatting(array)
    regex_array_to_string(array)
  end

  # NOTE: the original file had a bare `private` here. That keyword does not
  # affect methods defined with `def self.`, so every helper below has always
  # been publicly callable. Visibility is deliberately left unchanged; treat
  # the methods below as internal helpers.

  # Converts a string into a nested array where each bracketed group becomes
  # a nested array, to allow further manipulation.
  #   "a OR (b) OR c"              => ["a OR ", ["b"], " OR c"]
  #   "a OR (b AND (c OR d)) OR e" => ["a OR ", ["b AND ", ["c OR d"]], " OR e"]
  #
  # Returns the text unchanged (a String) when no bracketed group is found.
  def self.array_struct(text)
    cuts = text.scan(bracket_regexp).to_a.flatten.compact
    # If nothing found then return the original text
    return text if cuts.empty?

    # Cut the text around every bracketed group, in the order the groups were
    # found. `partition` splits around the FIRST occurrence only, so a group
    # that appears several times is extracted once per occurrence in `cuts`
    # (String#split would split on all occurrences at once and silently drop
    # trailing pieces).
    pieces = cuts.inject([text]) do |acc, cut|
      acc.each_with_object([]) do |piece, result|
        # Already-extracted groups are Arrays; only Strings can still contain
        # a cut. (Array#include? could otherwise match by element equality.)
        unless piece.is_a?(String) && piece.include?(cut)
          result << piece
          next
        end

        before, _matched, after = piece.partition(cut)
        cut_without_brackets = cut[1..-2]
        result << before << [cut_without_brackets] << after
      end
    end

    # Empty strings left over from cuts at the very start/end are dropped.
    # String elements are kept as they are; the content of each extracted
    # group is run through array_struct again to resolve nested brackets.
    pieces.reject(&:empty?).map do |piece|
      if piece.is_a?(String)
        piece
      else
        piece.map { |inner| array_struct(inner) }
      end
    end
  end

  # Converts the Lucene boolean ` OR ` into the regex `|`, removes double and
  # single quotation marks, and backslash-escapes hyphens that are matched by
  # the lookahead below (i.e. hyphens not followed by an unmatched `]`).
  # Arrays are processed recursively.
  #
  # Raises RuntimeError when an element at this level has a different number
  # of '(' and ')'.
  def self.ors_to_pipes(obj)
    if obj.is_a?(String)
      return obj.gsub(OR_REGEX, '|')
                .gsub('"', '')
                .gsub(/\-(?=([^\[]*\[[^\]]*\])*[^\[\]]*$)/, '\-')
                .gsub("'", '')
    end

    raise "There are unclosed brackets in this boolean string" if has_unclosed_brackets?(obj)

    # Recurse so that all levels of the array are converted
    obj.collect { |text| ors_to_pipes(text) }
  end

  # True when any element of `obj` has a different count of '(' and ')'.
  # For String elements this counts characters; for Array elements
  # Array#count counts elements equal to the bracket string.
  def self.has_unclosed_brackets?(obj)
    obj.any? { |o| o.count('(') != o.count(')') }
  end

  # Transforms the elements of the array structure into regex formatting:
  # - (a) Strings that are not bookended by `|` are wrapped in (?:) so that
  #       each piece of the structure is a self-contained regex group.
  # - (b) Any Lucene boolean ` AND ` is converted into an AND array whose
  #       first element is "AND" and whose remaining elements are the
  #       formatted operands.
  #   "pete AND james"                  => ["AND", "(?:pete)", "(?:james)"]
  #   ["jenny AND ", ["billy|jimmy"]]   => ["AND", "(?:jenny)", "(?:billy|jimmy)"]
  def self.regex_formatting(obj)
    # (a) A plain string: split on AND if present, otherwise wrap if needed.
    if obj.is_a?(String)
      if contain_AND?(obj)
        operands = obj.split(AND_REGEX).reject(&:empty?)
        return ['AND'] + operands.collect { |str| regex_formatting(str) }
      end

      return not_in_or?(obj) ? wrap_in_brackets(obj) : obj
    end

    # An array made only of strings: either build an AND array from all of
    # its operands, or join it into one string and wrap it if needed.
    if all_strings?(obj)
      if obj.any? { |str| contain_AND?(str) }
        return obj.each_with_object(['AND']) do |str, arr|
          str.split(AND_REGEX).reject(&:empty?).each do |operand|
            arr << regex_formatting(operand)
          end
        end
      end

      needs_brackets = obj.any? { |str| not_in_or?(str) }
      joined = obj.join('')
      return needs_brackets ? wrap_in_brackets(joined) : joined
    end

    # (b) A mixed array. If any string at this level contains ` AND `, the
    # whole level becomes an AND array.
    formatted = []
    formatted << 'AND' if obj.any? { |elem| contain_AND?(elem) }

    obj.each do |elem|
      if contain_AND?(elem)
        elem.split(AND_REGEX).reject(&:empty?).each do |operand|
          formatted << regex_formatting(operand)
        end
      else
        formatted << regex_formatting(elem)
      end
    end

    formatted
  end

  # Flattens the formatted array structure into a single string.
  # Regex has no equivalent of the Lucene boolean `AND`, so AND arrays are
  # rendered in the ad hoc `AND(['x','y'])` notation built by
  # construct_AND_array. (The original comment says this notation is consumed
  # by a lexer elsewhere in the project; that code is not visible here.)
  #
  # The argument is not modified.
  def self.regex_array_to_string(obj)
    return obj if obj.is_a?(String)

    and_array = is_AND_array?(obj)
    # Skip the leading "AND" marker without mutating the caller's array.
    elements = and_array ? obj.drop(1) : obj
    parts = elements.collect { |elem| regex_array_to_string(elem) }

    and_array ? construct_AND_array(parts) : parts.join('')
  end

  # True when the first element of the array is the "AND" marker.
  def self.is_AND_array?(array)
    array[0] == 'AND'
  end

  # Renders already-stringified operands as `AND(['x','y'])`. Operands that
  # themselves contain `AND([` (nested AND arrays) are not quoted.
  def self.construct_AND_array(array)
    and_prefix = 'AND(['
    and_suffix = "])"

    internal_str = array.collect do |str|
      str.include?(and_prefix) ? str : "'" + str + "'"
    end.join(',')

    and_prefix + internal_str + and_suffix
  end

  # Truthy when obj is a String containing ` AND `.
  def self.contain_AND?(obj)
    obj.is_a?(String) && obj.match(AND_REGEX)
  end

  def self.all_strings?(array)
    array.all? { |elem| elem.is_a?(String) }
  end

  # True when the text neither starts nor ends with the regex `|`, i.e. it is
  # not attached to a neighbouring alternative and can be wrapped on its own.
  #   |pieceofcontent| => false
  #   |pieceofcontent  => false
  #   pieceofcontent|  => false
  #   pieceofcontent   => true
  def self.not_in_or?(text)
    text[0] != '|' && text[-1] != '|'
  end

  def self.wrap_in_brackets(text)
    '(?:' + text + ')'
  end

  # Memoised regex source string for bracketed groups.
  def self.get_bracket_regex
    @bracket_regex ||= generate_brack_regex
  end

  # Memoised compiled form of get_bracket_regex, so that array_struct (which
  # recurses) does not recompile the pattern on every call.
  def self.bracket_regexp
    @bracket_regexp ||= Regexp.new(get_bracket_regex)
  end

  # Generates the bracket regex source. For simplicity, the regex for 'inside
  # of a bracket' is represented by the placeholder 'a' and the regex for
  # 'between two bracket groups' by the placeholder 'b'; both are substituted
  # at the end. Each valid bracket ordering becomes one alternative, joined
  # with `|` in the order produced by get_bracket_inputs.
  def self.generate_brack_regex
    puts "Loading bracket regex..."
    inside_brackets = "[^\(\)]*"
    not_open_bracket = "[^\(]*"

    get_bracket_inputs.collect do |input|
      bracket_input_to_brackets(input)
        .gsub('a', inside_brackets)
        .gsub('b', not_open_bracket)
    end.join('|')
  end

  # Builds the valid bracket orderings, where 0 represents an open bracket
  # and 1 a close bracket. Candidates are the binary digits of 0..2000 and
  # their reversals; only orderings accepted by valid? are kept. Since
  # 2000 < 2**11 and odd lengths are rejected, this covers balanced orderings
  # of up to 10 brackets.
  def self.get_bracket_inputs
    candidates = (0..2000).each_with_object([]) do |v, acc|
      bits = v.to_s(2).split('').collect(&:to_i)
      acc << bits
      acc << bits.reverse unless bits == bits.reverse
    end

    # De-duplicate once (first occurrence wins, so the order is unchanged).
    candidates.uniq.select { |bits| valid?(bits) }
  end

  # True when `input` (an array of 0 = open, 1 = close) is a balanced
  # bracket ordering.
  def self.valid?(input)
    # The total count of brackets must be even.
    return false unless input.size.even?
    # Only 0's or 1's are allowed as inputs
    return false if input.any? { |v| ![0, 1].include?(v) }
    # The number of open brackets must equal the number of close brackets
    return false unless input.inject(0) { |n, v| n + v } == input.size / 2
    # Can't start with a close bracket or end with an open bracket
    return false if input.first == 1 || input.last == 0

    # In every prefix, close brackets may not outnumber open brackets.
    closes = 0
    input.each_with_index.all? do |bit, i|
      closes += bit
      closes <= (i + 1) / 2
    end
  end

  # Converts a sequence of 0's and 1's to escaped open/close brackets with
  # 'a' placeholders, inserts a 'b' placeholder between any close bracket
  # directly followed by an open bracket, and finally compresses consecutive
  # 'a's into a single 'a' (the pattern they stand for ends in `*`, so
  # repeating it adds nothing).
  def self.bracket_input_to_brackets(input)
    brackets = ["\\(a", "a\\)"]

    input.collect { |i| brackets[i] }
         .join('')
         .gsub('\\)\\(', '\\)b\\(')
         .gsub(/a+/, 'a')
  end
end