lib/wayfarer/base.rb in wayfarer-0.4.6 vs lib/wayfarer/base.rb in wayfarer-0.4.7

- old
+ new

@@ -1,60 +1,138 @@ # frozen_string_literal: true module Wayfarer - class Base < ActiveJob::Base - include Wayfarer::Middleware::Controller + # @!attribute [r] task + # @return [Wayfarer::Task] the current task + # @!attribute [r] uri + # @return [Addressable::URI] Parsed task URL + # @!attribute [r] user_agent + # @return [Object] the user agent that retrieved the page + # @!attribute [r] action + # @return [Symbol, Object] action that the task URL was routed to. + # @!attribute [r] params + # @return [HashWithIndifferentAccess] path parameters collected from routes + module Base + extend ActiveSupport::Concern + # @!method stage(urls) + # Adds URLs to an internal staging set so that they get enqueued + # eventually, once the job executed successfully. + # @overload stage(urls) + # @param urls [Array<String>] URLs to add to the staging set. + # @overload stage(url) + # @param url [String] URL to add to the staging set. - use Wayfarer::Middleware::Stage - use Wayfarer::Middleware::Dedup - use Wayfarer::Middleware::Normalize - use Wayfarer::Middleware::Router - use Wayfarer::Middleware::Fetch - use Wayfarer::Middleware::Dispatch + # @!method fetch(url, follow: 3) + # @param url [String] URL to fetch using plain HTTP(S). + # @param follow [Integer] Number of redirects to follow. + # Retrieves the given URL to a {Page}. - ErrorHandler = lambda do |&block| - lambda do |job, error| - task = job.arguments.first - task.barrier.seen?(task.url) - task.gc.run - block.call(job, error) - end - end + # @!method page(live: false) + # @param live [Boolean] whether to retrieve a new {Page}. + # @return [Wayfarer::Page] + # Returns the most recently retrieved page or a new page + # for the current task URL if the `live` keyword is passed. 
- after_enqueue do |job| - task = job.arguments.first - task.counter.increment - end + # @!scope class - after_perform do |job| - task = job.arguments.first - task.gc.run - end + # @!attribute [r] route + # @return [Wayfarer::Routing::DSL] + # The job's {Wayfarer::Routing::DSL} that maps URLs to instance methods + # or to a {Handler}. + # @example Append a host route + # route.host "example.com", to: :index - rescue_from(StandardError) do - task = arguments.first - task.gc.run - end + # @!method content_types(*content_types) + # @param content_types [*Array<String, Regexp>] Content-Types to whitelist + # Whitelists Content-Types. Once at least one Content-Type is set, only + # those Content-Types will be processed. - def self.retry_on(*argv, &block) - super(*argv, &ErrorHandler.call(&block)) - end + # @!group Callbacks - def self.discard_on(*argv, &block) - super(*argv, &ErrorHandler.call(&block)) - end + # @!method before_fetch + # @overload before_fetch(callback) + # @param callback [Symbol] Instance method to call + # @overload before_fetch(&block) + # @yield [Wayfarer::Task] + # Registers a callback that is called before the page is fetched. + # If a symbol is passed, an instance method with the same name will be + # called. + # @example Accessing the user agent in {#before_fetch} + # before_fetch do |task| + # user_agent # => the user agent that will fetch the page + # end - def self.crawl(url, batch: SecureRandom.uuid) - Task.new(url, batch).tap do |task| - perform_later(task) - end - end + # @!method around_fetch + # @overload around_fetch(callback) + # @param callback [Symbol] Instance method to call + # @overload around_fetch(&block) + # @yield [Wayfarer::Task] + # Registers a callback that is called around the page getting fetched. + # If a symbol is passed, an instance method with the same name will be + # called. - def retry_job(...) - super(...) 
# increments the counter by re-enqueuing the job - task = arguments.first - task.counter.decrement + # @!method after_fetch + # @overload after_fetch(callback) + # @param callback [Symbol] Instance method to call + # @overload after_fetch(&block) + # @yield [Wayfarer::Task] + # Registers a callback that is called after the page was fetched. + # If a symbol is passed, an instance method with the same name will be + # called. + + # @!method before_perform + # @overload before_perform(callback) + # @param callback [Symbol] Instance method to call + # @overload before_perform(&block) + # @yield [Wayfarer::Task] + # Registers a callback that is called before the task is performed. + # If a symbol is passed, an instance method with the same name will be + # called. + + # @!method around_perform + # @overload around_perform(callback) + # @param callback [Symbol] Instance method to call + # @overload around_perform(&block) + # @yield [Wayfarer::Task] + # Registers a callback that is called around the task getting performed. + # If a symbol is passed, an instance method with the same name will be + # called. + + # @!method after_perform + # @overload after_perform(callback) + # @param callback [Symbol] Instance method to call + # @overload after_perform(&block) + # @yield [Wayfarer::Task] + # Registers a callback that is called after the task was performed. + # If a symbol is passed, an instance method with the same name will be + # called. 
+ + # @!endgroup + + included do + include Wayfarer::Middleware::Controller + + # Implement ActiveJob's #perform by calling into our own middleware chain + alias_method :perform, :call + + # Middleware stack + use Wayfarer::Middleware::Redis + use Wayfarer::Middleware::BatchCompletion + use Wayfarer::Middleware::UriParser + use Wayfarer::Middleware::Normalize + use Wayfarer::Middleware::Dedup + use Wayfarer::Middleware::Stage + use Wayfarer::Middleware::Router + use Wayfarer::Middleware::UserAgent + use Wayfarer::Middleware::ContentType + use Wayfarer::Middleware::Dispatch end - alias perform call + class_methods do + def crawl(url, batch: SecureRandom.uuid) + Task.new(url, batch).tap do |task| + perform_later(task) + end + end + end end end