lib/wayfarer/base.rb in wayfarer-0.4.6 vs lib/wayfarer/base.rb in wayfarer-0.4.7
- old
+ new
@@ -1,60 +1,138 @@
# frozen_string_literal: true
module Wayfarer
- class Base < ActiveJob::Base
- include Wayfarer::Middleware::Controller
+ # @!attribute [r] task
+ # @return [Wayfarer::Task] the current task
+ # @!attribute [r] uri
+ # @return [Addressable::URI] Parsed task URL
+ # @!attribute [r] user_agent
+ # @return [Object] the user agent that retrieved the page
+ # @!attribute [r] action
+ # @return [Symbol, Object] action that the task URL was routed to.
+ # @!attribute [r] params
+ # @return [HashWithIndifferentAccess] path parameters collected from routes
+ module Base
+ extend ActiveSupport::Concern
+ # @!method stage(urls)
+ # Adds URLs to an internal staging set so that they get enqueued
+ # eventually, once the job has executed successfully.
+ # @overload stage(urls)
+ # @param urls [Array<String>] URLs to add to the staging set.
+ # @overload stage(url)
+ # @param url [String] URL to add to the staging set.
- use Wayfarer::Middleware::Stage
- use Wayfarer::Middleware::Dedup
- use Wayfarer::Middleware::Normalize
- use Wayfarer::Middleware::Router
- use Wayfarer::Middleware::Fetch
- use Wayfarer::Middleware::Dispatch
+ # @!method fetch(url, follow: 3)
+ # @param url [String] URL to fetch using plain HTTP(S).
+ # @param follow [Integer] Number of redirects to follow.
+ # Retrieves the given URL as a {Page}.
- ErrorHandler = lambda do |&block|
- lambda do |job, error|
- task = job.arguments.first
- task.barrier.seen?(task.url)
- task.gc.run
- block.call(job, error)
- end
- end
+ # @!method page(live: false)
+ # @param live [Boolean] whether to retrieve a new {Page}.
+ # @return [Wayfarer::Page]
+ # Returns the most recently retrieved page or a new page
+ # for the current task URL if the `live` keyword is passed.
- after_enqueue do |job|
- task = job.arguments.first
- task.counter.increment
- end
+ # @!scope class
- after_perform do |job|
- task = job.arguments.first
- task.gc.run
- end
+ # @!attribute [r] route
+ # @return [Wayfarer::Routing::DSL]
+ # The job's {Wayfarer::Routing::DSL} that maps URLs to instance methods
+ # or to a {Handler}.
+ # @example Append a host route
+ # route.host "example.com", to: :index
- rescue_from(StandardError) do
- task = arguments.first
- task.gc.run
- end
+ # @!method content_types(*content_types)
+ # @param content_types [Array<String, Regexp>] Content-Types to whitelist
+ # Whitelists Content-Types. Once at least one Content-Type is set, only
+ # those Content-Types will be processed.
- def self.retry_on(*argv, &block)
- super(*argv, &ErrorHandler.call(&block))
- end
+ # @!group Callbacks
- def self.discard_on(*argv, &block)
- super(*argv, &ErrorHandler.call(&block))
- end
+ # @!method before_fetch
+ # @overload before_fetch(callback)
+ # @param callback [Symbol] Instance method to call
+ # @overload before_fetch(&block)
+ # @yield [Wayfarer::Task]
+ # Registers a callback that is called before the page is fetched.
+ # If a symbol is passed, an instance method with the same name will be
+ # called.
+ # @example Accessing the user agent in {#before_fetch}
+ # before_fetch do |task|
+ # user_agent # => the user agent that will fetch the page
+ # end
- def self.crawl(url, batch: SecureRandom.uuid)
- Task.new(url, batch).tap do |task|
- perform_later(task)
- end
- end
+ # @!method around_fetch
+ # @overload around_fetch(callback)
+ # @param callback [Symbol] Instance method to call
+ # @overload around_fetch(&block)
+ # @yield [Wayfarer::Task]
+ # Registers a callback that is called around the page getting fetched.
+ # If a symbol is passed, an instance method with the same name will be
+ # called.
- def retry_job(...)
- super(...) # increments the counter by re-enqueuing the job
- task = arguments.first
- task.counter.decrement
+ # @!method after_fetch
+ # @overload after_fetch(callback)
+ # @param callback [Symbol] Instance method to call
+ # @overload after_fetch(&block)
+ # @yield [Wayfarer::Task]
+ # Registers a callback that is called after the page was fetched.
+ # If a symbol is passed, an instance method with the same name will be
+ # called.
+
+ # @!method before_perform
+ # @overload before_perform(callback)
+ # @param callback [Symbol] Instance method to call
+ # @overload before_perform(&block)
+ # @yield [Wayfarer::Task]
+ # Registers a callback that is called before the task is performed.
+ # If a symbol is passed, an instance method with the same name will be
+ # called.
+
+ # @!method around_perform
+ # @overload around_perform(callback)
+ # @param callback [Symbol] Instance method to call
+ # @overload around_perform(&block)
+ # @yield [Wayfarer::Task]
+ # Registers a callback that is called around the task getting performed.
+ # If a symbol is passed, an instance method with the same name will be
+ # called.
+
+ # @!method after_perform
+ # @overload after_perform(callback)
+ # @param callback [Symbol] Instance method to call
+ # @overload after_perform(&block)
+ # @yield [Wayfarer::Task]
+ # Registers a callback that is called after the task was performed.
+ # If a symbol is passed, an instance method with the same name will be
+ # called.
+
+ # @!endgroup
+
+ included do
+ include Wayfarer::Middleware::Controller
+
+ # Implement ActiveJob's #perform by calling into our own middleware chain
+ alias_method :perform, :call
+
+ # Middleware stack
+ use Wayfarer::Middleware::Redis
+ use Wayfarer::Middleware::BatchCompletion
+ use Wayfarer::Middleware::UriParser
+ use Wayfarer::Middleware::Normalize
+ use Wayfarer::Middleware::Dedup
+ use Wayfarer::Middleware::Stage
+ use Wayfarer::Middleware::Router
+ use Wayfarer::Middleware::UserAgent
+ use Wayfarer::Middleware::ContentType
+ use Wayfarer::Middleware::Dispatch
end
- alias perform call
+ class_methods do
+ def crawl(url, batch: SecureRandom.uuid)
+ Task.new(url, batch).tap do |task|
+ perform_later(task)
+ end
+ end
+ end
end
end