lib/pupa/runner.rb in pupa-0.0.7 vs lib/pupa/runner.rb in pupa-0.0.8

- old
+ new

@@ -1,6 +1,5 @@ -require 'fileutils' require 'optparse' require 'ostruct' require 'moped' @@ -17,10 +16,11 @@ actions: [], tasks: [], output_dir: File.expand_path('scraped_data', Dir.pwd), cache_dir: File.expand_path('web_cache', Dir.pwd), expires_in: 86400, # 1 day + validate: true, host_with_port: 'localhost:27017', database: 'pupa', dry_run: false, level: 'INFO', }.merge(defaults)) @@ -70,19 +70,22 @@ options.actions << v end opts.on('-t', '--task TASK', @processor_class.tasks, 'Select a scraping task to run (you may give this switch multiple times)', " (#{@processor_class.tasks.join(', ')})") do |v| options.tasks << v end - opts.on('-o', '--output_dir PATH', 'The directory in which to dump JSON documents') do |v| + opts.on('-o', '--output_dir PATH', 'The directory or Redis address (e.g. redis://localhost:6379) in which to dump JSON documents') do |v| options.output_dir = v end - opts.on('-c', '--cache_dir PATH', 'The directory in which to cache HTTP requests') do |v| + opts.on('-c', '--cache_dir PATH', 'The directory or Memcached address (e.g. memcached://localhost:11211) in which to cache HTTP requests') do |v| options.cache_dir = v end opts.on('-e', '--expires_in SECONDS', "The cache's expiration time in seconds") do |v| options.expires_in = v end + opts.on('--[no-]validate', 'Validate JSON documents') do |v| + options.validate = v + end opts.on('-H', '--host HOST:PORT', 'The host and port to MongoDB') do |v| options.host_with_port = v end opts.on('-d', '--database NAME', 'The name of the MongoDB database') do |v| options.database = v @@ -135,11 +138,16 @@ end if options.tasks.empty? options.tasks = @processor_class.tasks end - processor = @processor_class.new(options.output_dir, cache_dir: options.cache_dir, expires_in: options.expires_in, level: options.level, options: Hash[*rest]) + processor = @processor_class.new(options.output_dir, + cache_dir: options.cache_dir, + expires_in: options.expires_in, + validate: options.validate, + level: options.level, + options: Hash[*rest]) options.actions.each do |action| unless action == 'scrape' || processor.respond_to?(action) abort %(`#{action}` is not a #{opts.program_name} action. See `#{opts.program_name} --help` for a list of available actions.) end @@ -172,16 +180,10 @@ } Pupa.session = Moped::Session.new([options.host_with_port], database: options.database) if options.actions.delete('scrape') - FileUtils.mkdir_p(options.output_dir) - FileUtils.mkdir_p(options.cache_dir) - - Dir[File.join(options.output_dir, '*.json')].each do |path| - FileUtils.rm(path) - end - + processor.store.clear report[:scrape] = {} options.tasks.each do |task_name| report[:scrape][task_name] = processor.dump_scraped_objects(task_name) end end