require 'dragonfly'

# When using this gem, you'll start by defining a {Scraper}, with methods for
# retrieving and processing data. The data will be stored in {DataStorage};
# this gem currently provides only a {DataStorage::FileDataStore FileDataStore}.
# You may enhance a datastore with {Decorators} and {Observers}: for example,
# a {Decorators::Timeout Timeout} decorator to retry on timeout with exponential
# backoff and an {Observers::Log Log} observer which logs retrieval progress.
# Of course, you must also define a {Processors Processor} to turn your raw data
# into machine-readable data.
#
# A simple skeleton scraper:
#
#     require 'unbreakable'
#
#     class MyScraper < Unbreakable::Scraper
#       def retrieve(args)
#         # download all the documents
#       end
#       def processable
#         # return a list of documents to process
#       end
#     end
#
#     class MyProcessor < Unbreakable::Processors::Transform
#       def perform
#         # return the transformed record as a hash, array, etc.
#       end
#       def persist(arg)
#         # store the hash/array/etc. in Mongo, MySQL, YAML, etc.
#       end
#     end
#
#     scraper = MyScraper.new
#     scraper.processor.register MyProcessor
#     scraper.configure do |c|
#       # configure the scraper
#     end
#     scraper.run(ARGV)
#
# Every scraper script can run as a command-line script. Try it!
#
#     $ ruby myscraper.rb
#     usage: irb [options] []
#
#     The most commonly used commands are:
#       retrieve    Cache remote files to the datastore for later processing
#       process     Process cached files into machine-readable data
#       config      Print the current configuration
#
#     Specific options:
#         --root_path ARG                  default "/var/tmp/unbreakable"
#         --[no-]store_meta                default true
#         --cache_duration ARG             default 31536000
#         --fallback_mime_type ARG         default "application/octet-stream"
#         --secret ARG                     default "secret yo"
#         --[no-]trust_file_extensions     default true
#
#     General options:
#         -h, --help                       Display this screen
module Unbreakable
  # Every component below is registered with `autoload`, so its file is only
  # required the first time the constant is referenced.
  autoload :Scraper, 'unbreakable/scraper'

  # Namespace for processors.
  module Processors
    autoload :Transform, 'unbreakable/processors/transform'
  end

  # Namespace for datastore observers.
  module Observers
    autoload :Observer, 'unbreakable/observers/observer'
    autoload :Log, 'unbreakable/observers/log'
  end

  # Namespace for datastore decorators.
  module Decorators
    autoload :Timeout, 'unbreakable/decorators/timeout'
  end

  # Namespace for datastore implementations.
  module DataStorage
    autoload :FileDataStore, 'unbreakable/data_storage/file_data_store'
  end

  # Root of the error hierarchy declared in this module; rescue this class to
  # catch any of its subclasses.
  class UnbreakableError < StandardError; end

  # NOTE(review): presumably raised when a retrieved remote file is unusable;
  # the raise sites are not visible here -- confirm against callers.
  class InvalidRemoteFile < UnbreakableError; end
end