lib/pupa/runner.rb in pupa-0.0.7 vs lib/pupa/runner.rb in pupa-0.0.8
- old
+ new
@@ -1,6 +1,5 @@
-require 'fileutils'
require 'optparse'
require 'ostruct'
require 'moped'
@@ -17,10 +16,11 @@
actions: [],
tasks: [],
output_dir: File.expand_path('scraped_data', Dir.pwd),
cache_dir: File.expand_path('web_cache', Dir.pwd),
expires_in: 86400, # 1 day
+ validate: true,
host_with_port: 'localhost:27017',
database: 'pupa',
dry_run: false,
level: 'INFO',
}.merge(defaults))
@@ -70,19 +70,22 @@
options.actions << v
end
opts.on('-t', '--task TASK', @processor_class.tasks, 'Select a scraping task to run (you may give this switch multiple times)', " (#{@processor_class.tasks.join(', ')})") do |v|
options.tasks << v
end
- opts.on('-o', '--output_dir PATH', 'The directory in which to dump JSON documents') do |v|
+ opts.on('-o', '--output_dir PATH', 'The directory or Redis address (e.g. redis://localhost:6379) in which to dump JSON documents') do |v|
options.output_dir = v
end
- opts.on('-c', '--cache_dir PATH', 'The directory in which to cache HTTP requests') do |v|
+ opts.on('-c', '--cache_dir PATH', 'The directory or Memcached address (e.g. memcached://localhost:11211) in which to cache HTTP requests') do |v|
options.cache_dir = v
end
opts.on('-e', '--expires_in SECONDS', "The cache's expiration time in seconds") do |v|
options.expires_in = v
end
+ opts.on('--[no-]validate', 'Validate JSON documents') do |v|
+ options.validate = v
+ end
opts.on('-H', '--host HOST:PORT', 'The host and port to MongoDB') do |v|
options.host_with_port = v
end
opts.on('-d', '--database NAME', 'The name of the MongoDB database') do |v|
options.database = v
@@ -135,11 +138,16 @@
end
if options.tasks.empty?
options.tasks = @processor_class.tasks
end
- processor = @processor_class.new(options.output_dir, cache_dir: options.cache_dir, expires_in: options.expires_in, level: options.level, options: Hash[*rest])
+ processor = @processor_class.new(options.output_dir,
+ cache_dir: options.cache_dir,
+ expires_in: options.expires_in,
+ validate: options.validate,
+ level: options.level,
+ options: Hash[*rest])
options.actions.each do |action|
unless action == 'scrape' || processor.respond_to?(action)
abort %(`#{action}` is not a #{opts.program_name} action. See `#{opts.program_name} --help` for a list of available actions.)
end
@@ -172,16 +180,10 @@
}
Pupa.session = Moped::Session.new([options.host_with_port], database: options.database)
if options.actions.delete('scrape')
- FileUtils.mkdir_p(options.output_dir)
- FileUtils.mkdir_p(options.cache_dir)
-
- Dir[File.join(options.output_dir, '*.json')].each do |path|
- FileUtils.rm(path)
- end
-
+ processor.store.clear
report[:scrape] = {}
options.tasks.each do |task_name|
report[:scrape][task_name] = processor.dump_scraped_objects(task_name)
end
end