require 'webrobots/robotstxt'
require 'uri'
require 'net/https'

if defined?(Nokogiri)
  require 'webrobots/nokogiri'
else
  autoload :Nokogiri, 'webrobots/nokogiri'
end

class WebRobots
  # Creates a WebRobots object for a robot named +user_agent+, with
  # optional +options+.
  #
  # * :http_get => a custom method, proc, or anything that responds to
  #   .call(uri), to be used for fetching robots.txt.  It must return
  #   the response body if successful, or raise Net::HTTPNotFound if
  #   the resource is not found.  Any other error is regarded as a
  #   blanket ban.
  def initialize(user_agent, options = nil)
    @user_agent = user_agent
    @parser = RobotsTxt::Parser.new(user_agent)

    options ||= {}
    @http_get = options[:http_get] || method(:http_get)

    @robotstxt = {}
  end

  # Returns the robot name initially given.
  attr_reader :user_agent

  # Tests if the robot is allowed to access a resource at +url+.  If a
  # malformed URI string is given, URI::InvalidURIError is raised.  If
  # a relative URI or a non-HTTP/HTTPS URI is given, ArgumentError is
  # raised.
  def allowed?(url)
    site, request_uri = split_uri(url)
    return true if request_uri == '/robots.txt'
    robots_txt(site).allow?(request_uri)
  end

  # Equivalent to !allowed?(url).
  def disallowed?(url)
    !allowed?(url)
  end

  # Returns extended option values for a resource at +url+ in a hash
  # with each field name lower-cased.  See allowed?() for a list of
  # errors that may be raised.
  def options(url)
    site, = split_uri(url)
    robots_txt(site).options
  end

  # Equivalent to options(url)[token.downcase].
  def option(url, token)
    options(url)[token.downcase]
  end

  # Returns an array of Sitemap URLs.  See allowed?() for a list of
  # errors that may be raised.
  def sitemaps(url)
    site, = split_uri(url)
    robots_txt(site).sitemaps
  end

  private

  # Splits +url+ into its site part (a URI with a lower-cased host and
  # a path of '/') and its request URI, raising ArgumentError for
  # relative or non-HTTP/HTTPS URIs.
  def split_uri(url)
    site =
      if url.is_a?(URI)
        url.dup
      else
        begin
          URI.parse(url)
        rescue => e
          raise ArgumentError, e.message
        end
      end

    site.scheme && site.host or
      raise ArgumentError, "non-absolute URI: #{url}"

    site.is_a?(URI::HTTP) or
      raise ArgumentError, "non-HTTP/HTTPS URI: #{url}"

    request_uri = site.request_uri
    if (host = site.host).match(/[[:upper:]]/)
      site.host = host.downcase
    end
    site.path = '/'
    return site, request_uri
  end

  # Returns the parsed robots.txt for +site+, fetching and caching it
  # on first access.
  def robots_txt(site)
    cache_robots_txt(site) {
      fetch_robots_txt(site)
    }
  end

  # Fetches and parses robots.txt for +site+.  A missing robots.txt is
  # treated as an empty one, i.e. everything is allowed.
  def fetch_robots_txt(site)
    begin
      body = @http_get.call(site + 'robots.txt')
    rescue Net::HTTPNotFound
      body = ''
    end
    @parser.parse(body, site)
  end

  # Memoizes the parsed robots.txt per site.
  def cache_robots_txt(site, &block)
    if @robotstxt.key?(site)
      @robotstxt[site]
    else
      @robotstxt[site] = block.call(site)
    end
  end

  # Default fetcher: performs a GET on +uri+, following up to 10
  # redirects, and returns the response body.
  def http_get(uri)
    referer = nil
    10.times {
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = uri.is_a?(URI::HTTPS)
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      header = { 'User-Agent' => @user_agent }
      header['Referer'] = referer if referer
      # header is destroyed by this in ruby 1.9.2!
      response = http.get(uri.request_uri, header)
      case response
      when Net::HTTPSuccess
        return response.body
      when Net::HTTPRedirection
        referer = uri.to_s
        uri = URI(response['location'])
      else
        response.value
      end
    }
    raise 'too many HTTP redirects'
  end
end
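
# ----------------------------------------------------------------------------
# Usage sketch (not part of the library): a minimal, hypothetical example of
# driving the API defined above.  The robot name and the example.com URLs are
# assumptions for illustration only; running this performs real HTTP requests
# against the named host.
if __FILE__ == $0
  robots = WebRobots.new('ExampleBot/1.0')

  url = 'http://www.example.com/index.html'
  if robots.allowed?(url)
    puts "#{url} may be fetched"
  else
    puts "#{url} is disallowed by robots.txt"
  end

  # Extended robots.txt fields (e.g. Crawl-delay) and Sitemap entries:
  p robots.option(url, 'crawl-delay')
  p robots.sitemaps('http://www.example.com/')
end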