# = Text Extraction Class
#
# Extractor was designed particularly for extracting source code from embedded
# comment blocks.
#
# Todo:
#   - How can we handle embedded code in standard comments? Eg. #
#
class Regex

  VERSION = "1.1"

  # When the regular expression returns multiple groups,
  # each is divided by the group delimiter.
  # This is the default value.
  DELIMINATOR_GROUP  = 29.chr + "\n"

  # When using repeat mode, each match is divided by
  # the record delimiter. This is the default value.
  DELIMINATOR_RECORD = 30.chr + "\n"

  require 'fileutils'
  require 'open-uri'

  require 'regex/string'
  require 'regex/command'

  # TODO: generalize to plugin
  require 'regex/templates/common'

  #
  #attr_accessor :text

  # Remove XML tags from search.
  attr_accessor :unxml

  # Regular expression.
  attr_accessor :pattern

  # Select built-in regular expression by name.
  attr_accessor :template

  # Index of expression return.
  attr_accessor :index

  # Ignore case.
  attr_accessor :insensitive

  # Repeat Match.
  attr_accessor :repeat

  # Output format.
  attr_accessor :format

  # DEPRECATE: Not needed anymore.
  #def self.load(io, options={}, &block)
  #  new(io, options, &block)
  #end

  # New extractor.
  #
  # io      - a String holding the text to search, or any object that
  #           responds to #read (its full content is read immediately).
  # options - Hash of attribute names to values; each pair is applied by
  #           calling the matching writer (e.g. :pattern => "x" calls
  #           #pattern=).
  #
  # Yields the new instance when a block is given.
  def initialize(io, options={})
    @raw = (String === io ? io : io.read)
    options.each do |k,v|
      __send__("#{k}=", v)
    end
    yield(self) if block_given?
  end

  # Read file.
  #def raw
  #  @raw ||= open(@file) # File.read(@file)
  #end

  #--
  # TODO: unxml is too primitive, use real xml parser like nokogiri
  #++

  # The text to be searched (memoized).
  #
  # When #unxml is set, anything between "<" and ">" is stripped first.
  # The original input is never modified: a non-destructive gsub is used,
  # because gsub! returns nil when nothing was substituted and would also
  # alter the caller's string in place.
  def text
    @text ||= (
      if unxml
        @raw.gsub(/\<(.*?)\>/, '')
      else
        @raw
      end
    )
  end

  # The regular expression used for matching (memoized).
  #
  # A named #template takes precedence and is looked up as a constant of
  # TEMPLATES. Otherwise #pattern is used: a Regexp is taken as-is, while a
  # String is compiled in multiline mode, plus case-insensitive mode when
  # #insensitive is set. Returns nil if neither yields an expression.
  def regex
    @regex ||= (
      if template
        TEMPLATES.const_get(template.upcase)
      else
        case pattern
        when Regexp
          pattern
        when String
          # Regexp.new takes a single integer of OR-ed option bits.
          flags = Regexp::MULTILINE
          flags |= Regexp::IGNORECASE if insensitive
          Regexp.new(pattern, flags)
        end
      end
    )
  end

  # Render the match results as a string.
  #
  # format - :yaml or :json for structured output; anything else produces
  #          plain text with groups joined by the group delimiter and, in
  #          repeat mode, matches joined by the record delimiter.
  #
  # NOTE: the +format+ argument shadows the #format accessor, so the
  # accessor's value is not consulted here.
  def to_s(format=nil)
    case format
    when :yaml
      to_s_yaml
    when :json
      to_s_json
    else
      out = structure
      if repeat
        out = out.map{ |m| m.join(deliminator_group) }
        out = out.join(deliminator_record) #.chomp("\n") + "\n"
      else
        out = out.join(deliminator_group) #.chomp("\n") + "\n"
      end
      out
    end
  end

  # Render the structured match results as YAML.
  def to_s_yaml
    require 'yaml'
    structure.to_yaml
  end

  # Render the structured match results as JSON.
  # Falls back to json_pure when the json library cannot be loaded.
  def to_s_json
    begin
      require 'json'
    rescue LoadError
      require 'json_pure'
    end
    structure.to_json
  end

  # Structure the matchdata according to specified options.
  def structure
    repeat ? structure_repeat : structure_single
  end

  # Structure the matchdata for single match.
  #
  # Returns a one-element array holding the group at #index when an index
  # is set; otherwise the captured groups when there are any, or else the
  # whole match.
  def structure_single
    md = extract
    if index
      [md[index]]
    elsif md.size > 1
      md[1..-1]
    else
      [md[0]]
    end
  end

  # Structure the matchdata for repeat matches.
  #
  # Returns one array per match, each shaped the same way as the result
  # of #structure_single.
  def structure_repeat
    out = extract
    if index
      out.map{ |md| [md[index]] }
    else
      out.map{ |md| md.size > 1 ? md[1..-1] : [md[0]] }
    end
  end

  # Extract match from source text.
  def extract
    if repeat
      extract_repeat
    else
      extract_single
    end
  end

  #
  #def extract_single
  #  out = []
  #  if md = matchdata
  #    if index
  #      out << md[index]
  #    elsif md.size > 1
  #      out = md[1..-1] #.join(deliminator_group)
  #    else
  #      out = md
  #    end
  #  end
  #  return out
  #end

  # Extract single match from source text.
  # Returns the MatchData, or an empty array when nothing matched.
  def extract_single
    md = regex.match(text)
    md ? md : []
  end

  #
  #def matchdata
  #  regex.match(text)
  #end

  #
  #def extract_repeat
  #  out = []
  #  text.scan(regex) do
  #    md = $~
  #    if index
  #      out << [md[index]]
  #    elsif md.size > 1
  #      out << md[1..-1] #.join(deliminator_group)
  #    else
  #      out << md
  #    end
  #  end
  #  out #.join(deliminator_record)
  #end

  # Extract repeat matches from source text.
  # Returns an array with the MatchData of every match, in order.
  def extract_repeat
    out = []
    text.scan(regex) do
      out << $~
    end
    out
  end

  # Separator placed between the groups of a single match.
  def deliminator_group
    DELIMINATOR_GROUP
  end

  # Separator placed between successive matches in repeat mode.
  def deliminator_record
    DELIMINATOR_RECORD
  end

end