require 'forwardable'
require 'logger'
require 'optparse'
require 'securerandom'

require 'active_support/core_ext/class/attribute_accessors'

module Unbreakable
  # You may implement a scraper by subclassing this class:
  #
  #     require 'open-uri'
  #     class MyScraper < Unbreakable::Scraper
  #       # Stores the contents of +http://www.example.com/+ in +index.html+.
  #       def retrieve(args)
  #         store(:path => 'index.html'){ open('http://www.example.com/').read }
  #       end
  #
  #       # Processes +index.html+.
  #       def process(args)
  #         fetch('index.html').process(:transform).apply
  #       end
  #
  #       # Alternatively, you can just set the files to fetch, which will be
  #       # processed using a +:transform+ processor which you must implement.
  #       def processable
  #         ['index.html']
  #       end
  #     end
  #
  # To configure:
  #
  #     scraper.configure do |c|
  #       c.datastore = MyDataStore.new # default Unbreakable::DataStorage::FileDataStore.new(scraper)
  #       c.log = Logger.new('/path/to/file') # default Logger.new(STDOUT)
  #       c.datastore.store_meta = true # default false
  #     end
  #
  # The following instance methods must be implemented in sub-classes:
  #
  # * +retrieve+
  # * +process+ or +processable+
  class Scraper
    extend Forwardable

    def_delegators :@app, :add_child_configurable, :configure, :datastore,
      :fetch, :log, :processor

    # Names (as symbols) of extra commands that {#run} may dispatch to.
    # Subclasses append to this list to expose additional commands.
    cattr_accessor :commands
    @@commands = []

    # Initializes a Dragonfly app for storage and processing.
    def initialize
      # A random name is used so that each scraper gets its own app.
      @app = Dragonfly[SecureRandom.hex.to_sym]
      # defaults to Logger.new('/var/tmp/dragonfly.log')
      @app.log = Logger.new(STDOUT)
      # defaults to Dragonfly::DataStorage::FileDataStore.new
      @app.datastore = Unbreakable::DataStorage::FileDataStore.new(self)
      # defaults to '/var/tmp/dragonfly'
      @app.datastore.root_path = '/var/tmp/unbreakable'
      # defaults to true
      @app.datastore.store_meta = false
    end

    # Returns an option parser, building it on first use.
    #
    # The parser is assigned to +@opts+ before the option hooks run because
    # {#specific_options}, {#general_options} and the configuration
    # extraction all register their switches on +@opts+.
    #
    # @return [OptionParser] an option parser
    def opts
      if @opts.nil?
        @opts = OptionParser.new
        @opts.banner = <<-eos
usage: #{@opts.program_name} [options] <command> [<args>]

The most commonly used commands are:
  retrieve  Cache remote files to the datastore for later processing
  process   Process cached files into machine-readable data
  config    Print the current configuration
        eos

        @opts.separator ''
        @opts.separator 'Specific options:'
        specific_options
        extract_configuration @app

        @opts.separator ''
        @opts.separator 'General options:'
        general_options

        @opts.on_tail('-h', '--help', 'Display this screen') do
          puts @opts
          exit
        end
      end
      @opts
    end

    #     def specific_options
    #       @opts.on('--echo ARG', 'Write a string to standard output') do |x|
    #         puts x
    #       end
    #     end
    #
    # @abstract Override to add specific options to the option parser.
    def specific_options; end

    #     def general_options
    #       @opts.on('--echo ARG', 'Write a string to standard output') do |x|
    #         puts x
    #       end
    #     end
    #
    # @abstract Override to add general options to the option parser.
    def general_options; end

    # Runs the command. Most often run from a command-line script as:
    #
    #     scraper.run(ARGV)
    #
    # Options are parsed (and removed from +args+) first; the next argument is
    # the command name and whatever remains is passed to the command.
    #
    # @param [Array<String>] args command-line arguments
    # @note Only call this method once per scraper instance.
    def run(args)
      opts.parse!(args)

      command = args.shift
      case command
      when 'retrieve'
        retrieve(args)
      when 'process'
        process(args)
      when 'config'
        print_configuration @app
      when nil
        puts opts
      else
        # Allow subclasses to add more commands. Only names registered in
        # +commands+ are dispatched, so arbitrary methods cannot be invoked
        # from the command line.
        if self.commands.include? command.to_sym
          send command, args
        else
          opts.abort "'#{command}' is not a #{opts.program_name} command. See '#{opts.program_name} --help'."
        end
      end
    end

    # Stores a record in the datastore.
    #
    # @param [Hash] opts options to pass to the datastore
    # @param [Proc] block a block that yields the contents of the file
    def store(opts = {}, &block)
      datastore.defer_store(opts, &block)
    end

    # Parses a JSON, HTML, XML, or YAML file. The format is chosen from the
    # file extension of the record's path; any other extension returns the
    # raw contents unchanged.
    #
    # @param [String, Dragonfly::TempObject] temp_object_or_uid a +TempObject+
    #   or record ID
    # @param [String] encoding a file encoding (used for HTML and XML only)
    # @return the parsing, either a Ruby or +Nokogiri+ type
    # @raise [LoadError] if the {http://nokogiri.org/ nokogiri} gem is
    #   unavailable for parsing an HTML or XML file
    def parse(temp_object_or_uid, encoding = 'utf-8')
      temp_object =
        if temp_object_or_uid.is_a?(Dragonfly::TempObject)
          temp_object_or_uid
        else
          fetch(temp_object_or_uid)
        end
      string = temp_object.data

      case File.extname(temp_object.path)
      when '.json'
        # Prefer yajl when installed; fall back to the standard library.
        begin
          require 'yajl'
          Yajl::Parser.parse string
        rescue LoadError
          require 'json'
          JSON.parse string
        end
      when '.html'
        require 'nokogiri'
        Nokogiri::HTML string, nil, encoding
      when '.xml'
        require 'nokogiri'
        Nokogiri::XML string, nil, encoding
      when '.yml', '.yaml'
        require 'yaml'
        # NOTE(review): cached files usually come from remote sources. On
        # Psych < 4, YAML.load can instantiate arbitrary objects; consider
        # YAML.safe_load if callers do not rely on custom types.
        YAML.load string
      else
        string
      end
    end

    # Caches remote files to the datastore for later processing.
    #
    # @abstract Must be implemented by subclasses.
    # @param [Array<String>] args command-line arguments
    # @raise [NotImplementedError] unless overridden
    def retrieve(args)
      raise NotImplementedError
    end

    # Processes cached files into machine-readable data. By default, runs the
    # +:transform+ processor on every record listed by {#processable}.
    #
    # @param [Array<String>] args command-line arguments
    def process(args)
      processable.each do |record|
        fetch(record).process(:transform, :args => args).apply
      end
    end

    # Returns a list of record IDs to process.
    #
    # @abstract Must be implemented by subclasses that do not override
    #   {#process}.
    # @return [Array<String>] a list of record IDs to process
    # @raise [NotImplementedError] unless overridden
    def processable
      raise NotImplementedError
    end

  private

    # Returns the effective value of a configuration attribute: the
    # explicitly configured value if there is one, else the default.
    #
    # A +nil+ check is used rather than +||+ so that an explicitly configured
    # +false+ is not mistaken for "unset" and replaced by the default.
    #
    # @param [#configuration] object
    # @param [Symbol] meth the configuration attribute's name
    def configuration_value(object, meth)
      value = object.configuration[meth]
      value.nil? ? object.default_configuration[meth] : value
    end

    # Adds a command-line switch for each boolean, string and integer
    # configuration attribute of +object+, recursing into attributes that are
    # themselves configurable.
    #
    # @param [#configuration] object
    def extract_configuration(object)
      object.config_methods.each do |meth|
        value = configuration_value(object, meth)
        description = "default #{value.inspect}"

        case value
        when true, false
          @opts.on("--[no-]#{meth}", description) do |x|
            object.configure { |c| c.send "#{meth}=", x }
          end
        when Integer
          # Coerce the argument so the setter receives an Integer, not a String.
          @opts.on("--#{meth} ARG", Integer, description) do |x|
            object.configure { |c| c.send "#{meth}=", x }
          end
        when String
          @opts.on("--#{meth} ARG", description) do |x|
            object.configure { |c| c.send "#{meth}=", x }
          end
        else
          # Guard against an attribute that points back at its owner.
          if object != value && value.respond_to?(:configuration)
            extract_configuration value
          end
        end
      end
    end

    # Prints the boolean, string and integer configuration attributes of
    # +object+ to standard output, recursing into attributes that are
    # themselves configurable.
    #
    # @param [#configuration] object
    # @param [Integer] indent the number of spaces to indent the output by
    def print_configuration(object, indent = 0)
      indentation = ' ' * indent
      puts "#{indentation}#{object.class.name}:"

      object.config_methods.each do |meth|
        value = configuration_value(object, meth)

        case value
        when true, false, String, Integer
          puts " #{indentation}#{meth.to_s.ljust(25 - indent)}#{value.inspect}"
        else
          # Guard against an attribute that points back at its owner.
          if object != value && value.respond_to?(:configuration)
            print_configuration value, indent + 2
          end
        end
      end
    end
  end
end