require 'nokogiri'
require 'logger'
require 'outlander/agent'
require 'outlander/threads_pool'

module Outlander
  # Mixin that turns the including class into a crawler definition.
  #
  # Including the module extends the class with ClassMethods (a small DSL for
  # declaring entry points, a setup hook and +process_*+ handlers) and gives
  # instances the code that fetches each URL through the agent, parses the
  # body with Nokogiri and runs the matching handler inside a ThreadsPool job.
  module Crawler
    # Fallback values used for any option the caller does not supply.
    DEFAULT_OPTIONS = { num_threads: 3 }.freeze

    # Hook run by Ruby when the module is included: installs the class-level
    # DSL and gives the including class its own empty registries.
    def self.included(base)
      base.extend ClassMethods
      base.class_eval do
        @roots = {}
        @handlers = {}
      end
    end

    # Class-level DSL available on every class that includes Crawler.
    module ClassMethods
      # roots    - Hash of url => handler name, filled by #entrypoint.
      # setup    - block stored by #before_start (nil when never declared).
      # handlers - Hash of handler name (Symbol) => block.
      attr_reader :roots, :setup, :handlers

      # Registers +url+ as a starting point to be processed by +handler+.
      def entrypoint(url, handler = :process_root)
        @roots[url] = handler
      end

      # Stores a block that #run! evaluates in the crawler instance before
      # the pool is started.
      def before_start(&block)
        @setup = block
      end

      # Any class-level call whose name starts with "process_" registers the
      # given block as the handler of that name; everything else falls
      # through to the normal NoMethodError behaviour.
      def method_missing(m, *args, &block)
        if m.to_s.start_with?('process_')
          @handlers[m] = block
        else
          super
        end
      end

      # Keeps respond_to? consistent with what method_missing accepts.
      def respond_to_missing?(name, include_private = false)
        name.to_s.start_with?('process_') || super
      end
    end

    # Builds a crawler and enqueues every declared entry point.
    #
    # options - Hash of settings:
    #           :cache_storage - assigned to the agent's cache_storage
    #                            (nil when absent).
    #           :log_to        - Logger target, defaults to STDOUT.
    #           :num_threads   - passed to ThreadsPool.new, defaults to
    #                            DEFAULT_OPTIONS[:num_threads].
    def initialize(options = {})
      # Work on a copy so the caller's hash is not modified by the delete.
      options = options.dup
      agent.cache_storage = options.delete(:cache_storage)
      @logger = Logger.new(options.fetch(:log_to, STDOUT))
      # Defaults go first so that caller-supplied values take precedence.
      @options = DEFAULT_OPTIONS.merge(options)
      @history = {}
      @pool = ThreadsPool.new(@options[:num_threads])
      self.class.roots.each { |url, handler| enqueue(url, handler) }
    end

    # Starts the crawl.
    #
    # The given block is stored as the result handler that #record calls.
    # The before_start hook, when one was declared, is evaluated in the
    # context of this instance first. Returns whatever the pool's #start
    # returns.
    def run!(&block)
      @result_handler = block
      setup = self.class.setup
      # instance_eval raises ArgumentError when handed a nil block, so only
      # run the hook if the class actually declared one.
      instance_eval(&setup) if setup
      @pool.start
    end

    private

    # Passes +data+ to the block that was given to #run!.
    def record(data)
      @result_handler.call data
    end

    # Schedules +url+ to be fetched and processed by the handler named
    # +handler+; extra +args+ are forwarded to the handler block after the
    # parsed document. A url/handler pair that was already scheduled is
    # skipped (+args+ are not part of the comparison).
    #
    # NOTE(review): handlers run inside pool jobs and may call enqueue
    # themselves; the history check below is not synchronised - confirm
    # whether ThreadsPool runs jobs concurrently enough for that to matter.
    def enqueue(url, handler, *args)
      key = [url, handler]
      return if @history[key]

      # Remember the pair up front so repeated links are not re-crawled.
      @history[key] = true

      @pool.enqueue do
        begin
          body = agent.get_with_cache(url)
          instance_exec Nokogiri::HTML(body), *args, &self.class.handlers[handler.to_sym]
        rescue => e
          # Failures are logged rather than raised so one bad page does not
          # propagate out of the job.
          @logger.error "Failed to process #{url} with ##{handler} #{e.inspect}"
        else
          @logger.info "Processed #{url} with ##{handler}"
        end
      end
      @logger.info "Enqueued #{url} for ##{handler}"
    end

    # Memoised copy of Agent (made with dup), so that settings such as
    # cache_storage are applied to this crawler's copy.
    def agent
      @agent ||= Agent.dup
    end
  end
end