require 'forwardable'
require 'optparse'
require 'securerandom'

require 'active_support/inflector/methods'

module Unbreakable
  # You may implement a scraper by subclassing this class:
  #
  #     require 'open-uri'
  #     class MyScraper < Unbreakable::Scraper
  #       # Stores the contents of +http://www.example.com/+ in +index.html+.
  #       def retrieve
  #         store(:path => 'index.html'){ open('http://www.example.com/').read }
  #       end
  #
  #       # Processes +index.html+.
  #       def process
  #         fetch('index.html').process(:transform).apply
  #       end
  #
  #       # Alternatively, you can just set the files to fetch, which will be
  #       # processed using a +:transform+ processor which you must implement.
  #       def processable
  #         ['index.html']
  #       end
  #     end
  #
  # To configure:
  #
  #     scraper.configure do |c|
  #       c.datastore = MyDataStore.new # default Unbreakable::DataStorage::FileDataStore.new(scraper)
  #       c.log = Logger.new('/path/to/file') # default Logger.new(STDOUT)
  #       c.datastore.store_meta = true # default false
  #     end
  #
  # The following instance methods must be implemented in sub-classes:
  #
  # * +retrieve+
  # * +process+ or +processable+
  class Scraper
    extend Forwardable

    # These calls are forwarded unchanged to the underlying Dragonfly app.
    def_delegators :@app, :add_child_configurable, :configure, :datastore,
      :fetch, :log, :processor

    # Initializes a Dragonfly app for storage and processing.
    #
    # The app is registered under a random name, and several Dragonfly
    # defaults are overridden (see the inline comments).
    def initialize
      @app = Dragonfly[SecureRandom.hex.to_sym]
      # defaults to Logger.new('/var/tmp/dragonfly.log')
      @app.log = Logger.new(STDOUT)
      # defaults to Dragonfly::DataStorage::FileDataStore.new
      @app.datastore = Unbreakable::DataStorage::FileDataStore.new(self)
      # defaults to '/var/tmp/dragonfly'
      @app.datastore.root_path = '/var/tmp/unbreakable'
      # defaults to true
      @app.datastore.store_meta = false
    end

    # Returns an option parser, building it on first use.
    #
    # One command-line switch is registered for every boolean, string and
    # integer setting found in the app's configuration (and in any nested
    # configurable objects), followed by a +--help+ switch.
    #
    # @return [OptionParser] an option parser
    def opts
      return @opts if @opts

      # @opts is assigned before #extract_configuration is called because that
      # helper registers its switches on @opts directly.
      @opts = OptionParser.new
      # NOTE(review): the "[]" after "[options]" looks like a placeholder such
      # as "<command> [<args>]" that lost its angle-bracketed tokens; confirm
      # against the project's history before changing the banner text.
      @opts.banner = <<-eos
usage: #{@opts.program_name} [options] []

The most commonly used commands are:
  retrieve    Cache remote files to the datastore for later processing
  process     Process cached files into machine-readable data
  config      Print the current configuration
      eos

      @opts.separator ''
      @opts.separator 'Specific options:'
      extract_configuration @app

      @opts.separator ''
      @opts.separator 'General options:'
      @opts.on_tail('-h', '--help', 'Display this screen') do
        puts @opts
        exit
      end

      @opts
    end

    # Runs the command. Most often run from a command-line script as:
    #
    #     scraper.run(ARGV)
    #
    # Switches are parsed (and removed from +args+) first; the next remaining
    # argument is taken as the command. With no command, the usage text is
    # printed. An unknown command aborts with an error message.
    #
    # @param [Array] args command-line arguments (modified in place)
    # @note Only call this method once per scraper instance.
    def run(args)
      opts.parse!(args)
      command = args.shift

      case command
      when 'retrieve'
        retrieve
      when 'process'
        process
      when 'config'
        print_configuration @app
      when nil
        puts opts
      else
        opts.abort "'#{command}' is not a #{opts.program_name} command. See '#{opts.program_name} --help'."
      end
    end

    # Stores a record in the datastore.
    #
    # @param [Hash] opts options to pass to the datastore
    # @param [Proc] block a block that yields the contents of the file
    def store(opts = {}, &block)
      datastore.defer_store(opts, &block)
    end

    # Parses a JSON, HTML, XML, or YAML file, chosen by the file extension of
    # the record's path. Any other extension returns the raw data unparsed.
    #
    # @param [String, Dragonfly::TempObject] temp_object_or_uid a +TempObject+
    #   or record ID
    # @param [String] encoding a file encoding (used for HTML and XML only)
    # @return the parsed document, either a Ruby or +Nokogiri+ type, or the
    #   unparsed string for unrecognized extensions
    # @raise [LoadError] if the {http://nokogiri.org/ nokogiri} gem is
    #   unavailable for parsing an HTML or XML file
    def parse(temp_object_or_uid, encoding = 'utf-8')
      temp_object =
        if temp_object_or_uid.is_a?(Dragonfly::TempObject)
          temp_object_or_uid
        else
          fetch(temp_object_or_uid)
        end
      string = temp_object.data

      case File.extname(temp_object.path)
      when '.json'
        # Prefer yajl when it is installed; otherwise fall back to json.
        begin
          require 'yajl'
          Yajl::Parser.parse string
        rescue LoadError
          require 'json'
          JSON.parse string
        end
      when '.html'
        require 'nokogiri'
        Nokogiri::HTML string, nil, encoding
      when '.xml'
        require 'nokogiri'
        Nokogiri::XML string, nil, encoding
      when '.yml', '.yaml'
        require 'yaml'
        # SECURITY NOTE(review): the data parsed here is retrieved content.
        # On Psych versions older than 4, YAML.load can instantiate arbitrary
        # objects; consider YAML.safe_load if the source is not trusted.
        YAML.load string
      else
        string
      end
    end

    # Caches remote files to the datastore for later processing.
    #
    # @raise [NotImplementedError] unless overridden in a sub-class
    def retrieve
      raise NotImplementedError
    end

    # Processes cached files into machine-readable data.
    #
    # The default implementation applies the +:transform+ processor to every
    # record returned by {#processable}.
    def process
      processable.each do |record|
        fetch(record).process(:transform).apply
      end
    end

    # Returns a list of record IDs to process.
    #
    # @return [Array] a list of record IDs to process
    # @raise [NotImplementedError] unless overridden in a sub-class
    def processable
      raise NotImplementedError
    end

  private

    # Registers one command-line switch on +@opts+ per boolean, string or
    # integer setting of +object+, recursing into nested configurable values.
    #
    # @param [#configuration] object
    def extract_configuration(object)
      object.default_configuration.merge(object.configuration).each do |key, value|
        case value
        when true, false
          @opts.on("--[no-]#{key}", "default #{value.inspect}") do |x|
            object.send "#{key}=", x
          end
        when Integer
          # Integer replaces Fixnum, which no longer exists on Ruby 3.2+.
          # Passing Integer to OptionParser converts the argument, so the
          # setter receives a number and not a String.
          @opts.on("--#{key} ARG", Integer, "default #{value.inspect}") do |x|
            object.send "#{key}=", x
          end
        when String
          @opts.on("--#{key} ARG", "default #{value.inspect}") do |x|
            object.send "#{key}=", x
          end
        else
          # Skip a value that is the object itself to avoid endless recursion.
          if object != value && value.respond_to?(:configuration)
            extract_configuration value
          end
        end
      end
    end

    # Prints the boolean, string and integer settings of +object+, then those
    # of any nested configurable values at a deeper indentation.
    #
    # @param [#configuration] object
    # @param [Integer] indent the number of spaces to indent the output by
    def print_configuration(object, indent = 0)
      indentation = ' ' * indent
      puts "#{indentation}#{object.class.name}:"
      object.default_configuration.merge(object.configuration).each do |key, value|
        case value
        when true, false, String, Integer
          # Integer replaces Fixnum, which no longer exists on Ruby 3.2+.
          puts " #{indentation}#{key.to_s.ljust 25 - indent}#{value.inspect}"
        else
          # Skip a value that is the object itself to avoid endless recursion.
          if object != value && value.respond_to?(:configuration)
            print_configuration value, indent + 2
          end
        end
      end
    end
  end
end