require 'spidr/rules'

module Spidr
  #
  # The {Filters} module adds methods to {Agent} for controlling which
  # URLs the agent will visit.
  #
  module Filters
    def self.included(base)
      base.module_eval do
        # List of acceptable URL schemes to follow
        attr_reader :schemes
      end
    end

    #
    # Initializes filtering rules.
    #
    # @param [Hash] options
    #   Additional options.
    #
    # @option options [Array] :schemes (['http', 'https'])
    #   The list of acceptable URI schemes to visit. Every entry is
    #   converted to a String, so Symbols may be given as well.
    #   When omitted, `https` is only enabled if `net/https` can be loaded.
    #
    # @option options [String] :host
    #   The host-name to visit.
    #
    # @option options [Array] :hosts
    #   The patterns which match the host-names to visit.
    #
    # @option options [Array] :ignore_hosts
    #   The patterns which match the host-names to not visit.
    #
    # @option options [Array] :ports
    #   The patterns which match the ports to visit.
    #
    # @option options [Array] :ignore_ports
    #   The patterns which match the ports to not visit.
    #
    # @option options [Array] :links
    #   The patterns which match the links to visit.
    #
    # @option options [Array] :ignore_links
    #   The patterns which match the links to not visit.
    #
    # @option options [Array] :urls
    #   The patterns which match the URLs to visit.
    #
    # @option options [Array] :ignore_urls
    #   The patterns which match the URLs to not visit.
    #
    # @option options [Array] :exts
    #   The patterns which match the URI path extensions to visit.
    #
    # @option options [Array] :ignore_exts
    #   The patterns which match the URI path extensions to not visit.
    #
    # @option options [Object] :queue
    #   If given, it is assigned through `queue=`, which must be provided
    #   by the class including this module.
    #
    # @option options [Object] :history
    #   If given, it is assigned through `history=`, which must be provided
    #   by the class including this module.
    #
    def initialize(options={})
      super(options)

      @schemes = []

      if options[:schemes]
        # Normalize to Strings, the same way #schemes= does. #visit_scheme?
        # performs a plain Array#include? lookup, so un-normalized entries
        # (such as Symbols) would never match a String scheme.
        @schemes += Array(options[:schemes]).map { |scheme| scheme.to_s }
      else
        @schemes << 'http'

        begin
          require 'net/https'

          @schemes << 'https'
        rescue Gem::LoadError
          # Gem::LoadError is a kind of ::LoadError, so it has to be rescued
          # first in order to be re-raised instead of being downgraded to
          # the warning below.
          raise
        rescue ::LoadError
          STDERR.puts "Warning: cannot load 'net/https', https support disabled"
        end
      end

      @host_rules = Rules.new(
        :accept => options[:hosts],
        :reject => options[:ignore_hosts]
      )
      @port_rules = Rules.new(
        :accept => options[:ports],
        :reject => options[:ignore_ports]
      )
      @link_rules = Rules.new(
        :accept => options[:links],
        :reject => options[:ignore_links]
      )
      @url_rules = Rules.new(
        :accept => options[:urls],
        :reject => options[:ignore_urls]
      )
      @ext_rules = Rules.new(
        :accept => options[:exts],
        :reject => options[:ignore_exts]
      )

      visit_hosts_like(options[:host]) if options[:host]

      self.queue = options[:queue] if options[:queue]
      self.history = options[:history] if options[:history]
    end

    #
    # Sets the list of acceptable URL schemes to visit.
    #
    # @param [Array] new_schemes
    #   The new schemes to visit. Every entry is converted to a String.
    #
    # @example
    #   agent.schemes = ['http']
    #
    def schemes=(new_schemes)
      @schemes = new_schemes.map { |scheme| scheme.to_s }
    end

    #
    # Specifies the patterns that match host-names to visit.
    #
    # @return [Array]
    #   The host-name patterns to visit.
    #
    def visit_hosts
      @host_rules.accept
    end

    #
    # Adds a given pattern to the {#visit_hosts}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match host-names with.
    #
    # @yield [host]
    #   If a block is given (and no pattern), it will be used to filter
    #   host-names.
    #
    # @yieldparam [String] host
    #   A host-name to accept or reject.
    #
    # @return [self]
    #
    def visit_hosts_like(pattern=nil,&block)
      if pattern
        visit_hosts << pattern
      elsif block
        visit_hosts << block
      end

      self
    end

    #
    # Specifies the patterns that match host-names to not visit.
    #
    # @return [Array]
    #   The host-name patterns to not visit.
    #
    def ignore_hosts
      @host_rules.reject
    end

    #
    # Adds a given pattern to the {#ignore_hosts}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match host-names with.
    #
    # @yield [host]
    #   If a block is given (and no pattern), it will be used to filter
    #   host-names.
    #
    # @yieldparam [String] host
    #   A host-name to reject or accept.
    #
    # @return [self]
    #
    def ignore_hosts_like(pattern=nil,&block)
      if pattern
        ignore_hosts << pattern
      elsif block
        ignore_hosts << block
      end

      self
    end

    #
    # Specifies the patterns that match the ports to visit.
    #
    # @return [Array]
    #   The port patterns to visit.
    #
    def visit_ports
      @port_rules.accept
    end

    #
    # Adds a given pattern to the {#visit_ports}.
    #
    # @param [Integer, Regexp] pattern
    #   The pattern to match ports with.
    #
    # @yield [port]
    #   If a block is given (and no pattern), it will be used to filter
    #   ports.
    #
    # @yieldparam [Integer] port
    #   A port to accept or reject.
    #
    # @return [self]
    #
    def visit_ports_like(pattern=nil,&block)
      if pattern
        visit_ports << pattern
      elsif block
        visit_ports << block
      end

      self
    end

    #
    # Specifies the patterns that match ports to not visit.
    #
    # @return [Array]
    #   The port patterns to not visit.
    #
    def ignore_ports
      @port_rules.reject
    end

    #
    # Adds a given pattern to the {#ignore_ports}.
    #
    # @param [Integer, Regexp] pattern
    #   The pattern to match ports with.
    #
    # @yield [port]
    #   If a block is given (and no pattern), it will be used to filter
    #   ports.
    #
    # @yieldparam [Integer] port
    #   A port to reject or accept.
    #
    # @return [self]
    #
    def ignore_ports_like(pattern=nil,&block)
      if pattern
        ignore_ports << pattern
      elsif block
        ignore_ports << block
      end

      self
    end

    #
    # Specifies the patterns that match the links to visit.
    #
    # @return [Array]
    #   The link patterns to visit.
    #
    # @since 0.2.4
    #
    def visit_links
      @link_rules.accept
    end

    #
    # Adds a given pattern to the {#visit_links}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match links with.
    #
    # @yield [link]
    #   If a block is given (and no pattern), it will be used to filter
    #   links.
    #
    # @yieldparam [String] link
    #   A link to accept or reject.
    #
    # @return [self]
    #
    # @since 0.2.4
    #
    def visit_links_like(pattern=nil,&block)
      if pattern
        visit_links << pattern
      elsif block
        visit_links << block
      end

      self
    end

    #
    # Specifies the patterns that match links to not visit.
    #
    # @return [Array]
    #   The link patterns to not visit.
    #
    def ignore_links
      @link_rules.reject
    end

    #
    # Adds a given pattern to the {#ignore_links}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match links with.
    #
    # @yield [link]
    #   If a block is given (and no pattern), it will be used to filter
    #   links.
    #
    # @yieldparam [String] link
    #   A link to reject or accept.
    #
    # @return [self]
    #
    def ignore_links_like(pattern=nil,&block)
      if pattern
        ignore_links << pattern
      elsif block
        ignore_links << block
      end

      self
    end

    #
    # Specifies the patterns that match the URLs to visit.
    #
    # @return [Array]
    #   The URL patterns to visit.
    #
    # @since 0.2.4
    #
    def visit_urls
      @url_rules.accept
    end

    #
    # Adds a given pattern to the {#visit_urls}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match URLs with.
    #
    # @yield [url]
    #   If a block is given (and no pattern), it will be used to filter
    #   URLs.
    #
    # @yieldparam [URI::HTTP, URI::HTTPS] url
    #   A URL to accept or reject.
    #
    # @return [self]
    #
    # @since 0.2.4
    #
    def visit_urls_like(pattern=nil,&block)
      if pattern
        visit_urls << pattern
      elsif block
        visit_urls << block
      end

      self
    end

    #
    # Specifies the patterns that match URLs to not visit.
    #
    # @return [Array]
    #   The URL patterns to not visit.
    #
    # @since 0.2.4
    #
    def ignore_urls
      @url_rules.reject
    end

    #
    # Adds a given pattern to the {#ignore_urls}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match URLs with.
    #
    # @yield [url]
    #   If a block is given (and no pattern), it will be used to filter
    #   URLs.
    #
    # @yieldparam [URI::HTTP, URI::HTTPS] url
    #   A URL to reject or accept.
    #
    # @return [self]
    #
    # @since 0.2.4
    #
    def ignore_urls_like(pattern=nil,&block)
      if pattern
        ignore_urls << pattern
      elsif block
        ignore_urls << block
      end

      self
    end

    #
    # Specifies the patterns that match the URI path extensions to visit.
    #
    # @return [Array]
    #   The URI path extensions patterns to visit.
    #
    def visit_exts
      @ext_rules.accept
    end

    #
    # Adds a given pattern to the {#visit_exts}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match URI path extensions with.
    #
    # @yield [ext]
    #   If a block is given (and no pattern), it will be used to filter
    #   URI path extensions.
    #
    # @yieldparam [String] ext
    #   A URI path extension to accept or reject.
    #
    # @return [self]
    #
    def visit_exts_like(pattern=nil,&block)
      if pattern
        visit_exts << pattern
      elsif block
        visit_exts << block
      end

      self
    end

    #
    # Specifies the patterns that match URI path extensions to not visit.
    #
    # @return [Array]
    #   The URI path extension patterns to not visit.
    #
    def ignore_exts
      @ext_rules.reject
    end

    #
    # Adds a given pattern to the {#ignore_exts}.
    #
    # @param [String, Regexp] pattern
    #   The pattern to match URI path extensions with.
    #
    # @yield [ext]
    #   If a block is given (and no pattern), it will be used to filter
    #   URI path extensions.
    #
    # @yieldparam [String] ext
    #   A URI path extension to reject or accept.
    #
    # @return [self]
    #
    def ignore_exts_like(pattern=nil,&block)
      if pattern
        ignore_exts << pattern
      elsif block
        ignore_exts << block
      end

      self
    end

    protected

    #
    # Determines if a given URI scheme should be visited.
    #
    # @param [String] scheme
    #   The URI scheme.
    #
    # @return [Boolean]
    #   Specifies whether the given scheme should be visited.
    #   A `nil` (or `false`) scheme is always accepted.
    #
    def visit_scheme?(scheme)
      return true unless scheme

      @schemes.include?(scheme)
    end

    #
    # Determines if a given host-name should be visited.
    #
    # @param [String] host
    #   The host-name.
    #
    # @return [Boolean]
    #   Specifies whether the given host-name should be visited.
    #
    def visit_host?(host)
      @host_rules.accept?(host)
    end

    #
    # Determines if a given port should be visited.
    #
    # @param [Integer] port
    #   The port number.
    #
    # @return [Boolean]
    #   Specifies whether the given port should be visited.
    #
    def visit_port?(port)
      @port_rules.accept?(port)
    end

    #
    # Determines if a given link should be visited.
    #
    # @param [String] link
    #   The link.
    #
    # @return [Boolean]
    #   Specifies whether the given link should be visited.
    #
    def visit_link?(link)
      @link_rules.accept?(link)
    end

    #
    # Determines if a given URL should be visited.
    #
    # @param [URI::HTTP, URI::HTTPS] link
    #   The URL.
    #
    # @return [Boolean]
    #   Specifies whether the given URL should be visited.
    #
    # @since 0.2.4
    #
    def visit_url?(link)
      @url_rules.accept?(link)
    end

    #
    # Determines if a given URI path extension should be visited.
    #
    # @param [String] path
    #   The path that contains the extension.
    #
    # @return [Boolean]
    #   Specifies whether the given URI path extension should be visited.
    #
    def visit_ext?(path)
      # File.extname includes the leading dot; [1..-1] strips it.
      # For a path without an extension this yields nil, which is passed
      # to the rules as-is.
      @ext_rules.accept?(File.extname(path)[1..-1])
    end
  end
end