# Author::    Robert Dormer (mailto:rdormer@gmail.com)
# Copyright:: Copyright (c) 2016 Robert Dormer
# License::   MIT

#==============================================
# This is the class that parses robots.txt and implements the exclusion
# checking logic therein. Works by breaking the file up into a hash of
# arrays of directives for each specified user agent, and then parsing the
# directives into internal arrays and iterating through the list to find a
# match. Urls are matched case sensitive, everything else is case insensitive.
# The root url is treated as a special case by using a token for it.
#==============================================

require 'cgi'

module Spider
  class ExclusionParser
    # Crawl delay taken from a "Crawl-delay" directive; nil when none was seen.
    attr_accessor :wait_time

    DISALLOW = "disallow".freeze
    DELAY = "crawl-delay".freeze
    ALLOW = "allow".freeze

    # Upper bound on the number of directives processed for one agent.
    MAX_DIRECTIVES = 1000

    # Token stored in the skip list for a rule whose value is exactly "/".
    # An entry carrying this token matches every url.
    NULL_MATCH = "*!*".freeze

    # Raw bytes of a UTF-8 byte order mark.
    UTF8_BOM = "\xEF\xBB\xBF".b.freeze

    # Pseudo-escaped stand-in used to keep "%2f" from being decoded to "/".
    SLASH_PLACEHOLDER = "^^^".freeze

    # Build a parser from the body of a robots.txt file.
    #
    # text  - robots.txt content. May be nil or empty (nothing is excluded).
    #         If the object responds to +http_status+ and that status is
    #         401 or 403, every url is treated as excluded.
    # agent - optional user agent name; compared case insensitively against
    #         the User-agent groups in the file. Falls back to the "*" group.
    #
    # Parsing is best effort: any error raised while parsing leaves the
    # parser with whatever rules were collected up to that point.
    def initialize(text, agent=nil)
      @skip_list = []
      # Group names are stored downcased by parse_text, so the lookup key
      # has to be downcased as well.
      @agent_key = agent.nil? ? nil : agent.to_s.downcase

      # Checked before the empty-body test so that a denied response with
      # no body still blocks everything. Plain strings carry no status.
      if text.respond_to?(:http_status) && [401, 403].include?(text.http_status)
        @skip_list << [NULL_MATCH, true]
        return
      end

      return if text.nil? || text.length.zero?

      begin
        grab_list(parse_text(text))
      rescue StandardError
        # Deliberately swallowed: a malformed robots.txt must not abort the
        # caller. Rules gathered before the failure remain in effect.
      end
    end

    # Check to see if the given url is matched by any rule in the file.
    # Rules are tried in file order and the first one whose token occurs
    # anywhere in the unescaped url decides the result. Returns true when
    # that rule is a disallow, false when it is an allow or nothing matched.
    def excluded?(url)
      url = safe_unescape(url)

      @skip_list.each do |token, excluded|
        return excluded if token == NULL_MATCH || url.include?(token)
      end

      false
    end

    # Inverse of excluded?.
    def allowed?(url)
      !excluded?(url)
    end

    private

    # Process the list of directives for a given user agent. Picks the
    # group for our agent (or the "*" group when there is none) and turns
    # each "key: value" line into a skip list entry or a crawl delay.
    # Stops after MAX_DIRECTIVES lines to guard against malformed files
    # or denial of service attacks.
    def grab_list(config)
      section = config[@agent_key] || config['*']
      return if section.nil?

      section.first(MAX_DIRECTIVES).each do |pair|
        # Limit of 2 keeps colons that are part of the value.
        key, value = pair.split(':', 2)
        next if key.nil? || value.nil?

        # Strip before the emptiness test so that a whitespace-only value
        # is skipped instead of becoming an empty, match-everything token.
        key = key.strip.downcase
        value = value.strip
        next if key.empty? || value.empty?

        case key
        when DISALLOW then disallow(value)
        when DELAY then delay(value)
        when ALLOW then allow(value)
        end
      end
    end

    # Top level file parsing method. Returns a hash mapping each downcased
    # user agent name to the array of directive lines in its group.
    # Comments and blank lines are dropped. Consecutive User-agent lines
    # share the directives that follow them. Directives that appear before
    # any User-agent line belong to no group and are ignored.
    def parse_text(text)
      config = {}
      current_keys = []
      previous_was_agent = false

      normalize(text).each_line do |line|
        # Remove the comment first so whitespace in front of it is
        # stripped along with the rest.
        line = line.sub(/#.*/, '').strip
        next if line.empty?

        if line =~ /\Auser-agent\s*:\s*(.+)/i
          agent = Regexp.last_match(1).downcase
          # A User-agent line that follows directives starts a new group.
          current_keys = [] unless previous_was_agent
          current_keys << agent unless current_keys.include?(agent)
          config[agent] ||= []
          previous_was_agent = true
        else
          current_keys.each { |agent| config[agent] << line }
          previous_was_agent = false
        end
      end

      config
    end

    # Return a copy of text with byte order marks removed and carriage
    # returns turned into newlines. The caller's string is left untouched.
    # The BOM is removed on a binary view of the data because matching raw
    # bytes against a non-binary string containing non-ASCII characters
    # raises an encoding error; the original encoding is restored after.
    def normalize(text)
      cleaned = text.b.gsub(UTF8_BOM, '').tr("\r", "\n")
      cleaned.force_encoding(text.encoding)
    end

    # Record value as an excluded pattern.
    def disallow(value)
      @skip_list << [rule_token(value), true]
    end

    # Record value as an explicitly allowed pattern.
    def allow(value)
      @skip_list << [rule_token(value), false]
    end

    # Turn a rule value into the token stored in the skip list: the root
    # url becomes NULL_MATCH, anything else loses one trailing "*" and is
    # unescaped the same way urls are in excluded?.
    def rule_token(value)
      return NULL_MATCH if value == "/"

      safe_unescape(value.chomp('*'))
    end

    # Record the crawl delay. The value is converted with to_i, so
    # non-numeric input yields a delay of 0.
    def delay(value)
      @wait_time = WaitTime.new(value.to_i)
    end

    # Percent-decode target while leaving encoded slashes encoded, so that
    # "/a%2Fb" is not confused with "/a/b". Encoded slashes are matched in
    # either case and always come back as lowercase "%2f", which keeps
    # rules and urls comparable.
    def safe_unescape(target)
      shielded = target.gsub(/%2f/i, SLASH_PLACEHOLDER)
      CGI.unescape(shielded).gsub(SLASH_PLACEHOLDER, '%2f')
    end
  end
end