# encoding: utf-8

# Converts a Lucene-style boolean query string (`OR`, `AND`, round brackets,
# optional quotation marks) into a regex-like string.
#
# `OR` becomes the regex alternation `|`. `AND` has no regex equivalent, so it
# is emitted as the literal placeholder text `#{andify["...", "..."]}` which is
# expected to be expanded by a later processing stage outside this module.
module Boogex
  # Converts +text+ into its regex string form.
  #
  # Side effect: prints a "Converting ..." trace line to stdout on every call.
  #
  # @param text [String] the boolean query, e.g. 'a OR (b AND c)'
  # @return [String] the regex string, e.g. 'a|#{andify["(?:b)", "(?:c)"]}'
  def self.convert(text)
    puts "Converting \"#{text}\" into regex"
    array = array_struct(text)
    array = ors_to_pipes(array)
    array = regex_formatting(array)
    regex_array_to_string(array)
  end

  # NOTE: `private` does not apply to methods defined with `def self.`, so the
  # helpers below remain publicly callable. They are left that way on purpose
  # so that existing callers of these helpers keep working.
  private

  # Converts a string into a nested array that mirrors its bracketing, so the
  # later stages can treat every bracketed group as a unit.
  #
  # Every balanced bracket group becomes a one-element array holding the
  # array_struct of the group's content; text outside groups stays a String.
  # Empty strings between groups are omitted.
  #
  #   "a OR b"                     => "a OR b"   (no groups: returned as-is)
  #   "a OR (b) OR c"              => ["a OR ", ["b"], " OR c"]
  #   "a OR (b AND (c OR d)) OR e" => ["a OR ", [["b AND ", ["c OR d"]]], " OR e"]
  #
  # Brackets are matched by depth counting, so any nesting depth and any number
  # of sibling groups are supported. An opening bracket without a matching
  # closing bracket (and a stray closing bracket) is kept as literal text.
  #
  # @param text [String]
  # @return [String, Array] +text+ itself when it has no balanced group,
  #   otherwise the nested array described above
  def self.array_struct(text)
    segments = []
    literal_start = 0
    cursor = 0
    group_found = false

    while (open_index = text.index('(', cursor))
      close_index = find_closing_bracket(text, open_index)
      if close_index.nil?
        # Unmatched "(": leave it in the surrounding literal text.
        cursor = open_index + 1
        next
      end

      group_found = true
      segments << text[literal_start...open_index] if open_index > literal_start
      # The extra wrapping array marks "this was a bracketed group".
      segments << [array_struct(text[(open_index + 1)...close_index])]
      cursor = literal_start = close_index + 1
    end

    return text unless group_found

    segments << text[literal_start..-1] if literal_start < text.length
    segments
  end

  # Returns the index of the ")" that balances the "(" at +open_index+, or nil
  # when the bracket is never closed.
  def self.find_closing_bracket(text, open_index)
    depth = 0
    text[open_index..-1].each_char.with_index(open_index) do |char, position|
      if char == '('
        depth += 1
      elsif char == ')'
        depth -= 1
        return position if depth.zero?
      end
    end
    nil
  end
  private_class_method :find_closing_bracket

  # Replaces the Lucene boolean ` OR ` with the regex `|` and strips single and
  # double quotation marks. Arrays are processed recursively, element by
  # element, so every nesting level is converted.
  #
  # @param obj [String, Array]
  # @return [String, Array] same shape as +obj+
  def self.ors_to_pipes(obj)
    return obj.gsub(' OR ', '|').gsub(/["']/, '') if obj.is_a?(String)

    obj.map { |element| ors_to_pipes(element) }
  end

  # Turns the (already OR-converted) structure into regex fragments:
  #
  # - (a) A string whose first AND last characters are both not `|` is wrapped
  #   in `(?:...)` so it forms a self-contained regex unit. Strings that start
  #   or end with `|` are separators/continuations and are left untouched.
  #
  # - (b) Anything containing the Lucene boolean ` AND ` becomes an "AND array":
  #   the first element is the marker "AND" and the rest are the operands.
  #     "pete AND james"                  => ["AND", "(?:pete)", "(?:james)"]
  #     ["pete AND james"]                => ["AND", "(?:pete)", "(?:james)"]
  #     ["jenny AND ", ["billy|jimmy"]]   => ["AND", "(?:jenny)", "(?:billy|jimmy)"]
  #
  # @param obj [String, Array]
  # @return [String, Array] a regex fragment, or an array of fragments
  #   (possibly an AND array)
  def self.regex_formatting(obj)
    if obj.is_a?(String)
      if contain_AND?(obj)
        operands = obj.split(' AND ').reject(&:empty?)
        return ['AND'] + operands.map { |operand| regex_formatting(operand) }
      end

      return not_in_or?(obj) ? wrap_in_brackets(obj) : obj
    end

    # (a) An array made only of AND-free strings collapses into one fragment.
    # Arrays holding an ` AND ` string must fall through to (b); otherwise the
    # AND would be emitted as literal text inside the regex.
    if all_strings?(obj) && obj.none? { |text| contain_AND?(text) }
      needs_brackets = obj.any? { |text| not_in_or?(text) }
      joined = obj.join('')
      return needs_brackets ? wrap_in_brackets(joined) : joined
    end

    # (b) If any string at this level contains ` AND `, the whole level is an
    # AND array.
    result = []
    result << 'AND' if obj.any? { |element| contain_AND?(element) }

    obj.each do |element|
      if contain_AND?(element)
        element.split(' AND ').reject(&:empty?).each do |operand|
          result << regex_formatting(operand)
        end
      else
        result << regex_formatting(element)
      end
    end
    result
  end

  # Flattens the formatted structure into the final string.
  #
  # Plain arrays are concatenated. AND arrays (first element "AND") are
  # rendered through construct_AND_array, because regex has no equivalent of
  # the Lucene boolean `AND`. The input array is not modified.
  #
  # @param obj [String, Array]
  # @return [String]
  def self.regex_array_to_string(obj)
    return obj if obj.is_a?(String)

    and_array = is_AND_array?(obj)
    # Skip the leading "AND" marker without mutating the caller's array.
    elements = and_array ? obj.drop(1) : obj
    parts = elements.map { |element| regex_array_to_string(element) }

    and_array ? construct_AND_array(parts) : parts.join('')
  end

  # True when +array+ starts with the "AND" marker.
  def self.is_AND_array?(array)
    array[0] == 'AND'
  end

  # Renders the operands of an AND array as the literal placeholder
  # `#{andify["op1", "op2"]}`. The `#{...}` is plain text here (single-quoted
  # string), not Ruby interpolation.
  def self.construct_AND_array(array)
    '#{andify["' + array.join('", "') + '"]}'
  end

  # True when +obj+ is a String containing the Lucene boolean ` AND `
  # (with a space on both sides).
  def self.contain_AND?(obj)
    obj.is_a?(String) && obj.include?(' AND ')
  end

  # True when every element of +array+ is a String (also true for []).
  def self.all_strings?(array)
    array.all? { |element| element.is_a?(String) }
  end

  # True when +text+ neither starts nor ends with the regex `|`, i.e. it is a
  # standalone piece that should be wrapped in a group.
  #   "|pieceofcontent|" => false
  #   "|pieceofcontent"  => false
  #   "pieceofcontent|"  => false
  #   "pieceofcontent"   => true
  #   ""                 => true
  def self.not_in_or?(text)
    text[0] != '|' && text[-1] != '|'
  end

  # Wraps +text+ in a non-capturing regex group.
  def self.wrap_in_brackets(text)
    '(?:' + text + ')'
  end
end