lib/exclusion.rb in spiderkit-0.1.2 vs lib/exclusion.rb in spiderkit-0.2.0

- old
+ new

@@ -2,148 +2,159 @@ # Copyright:: Copyright (c) 2016 Robert Dormer # License:: MIT #============================================== # This is the class that parses robots.txt and implements the exclusion -# checking logic therein. Works by breaking the file up into a hash of +# checking logic therein. Works by breaking the file up into a hash of # arrays of directives for each specified user agent, and then parsing the # directives into internal arrays and iterating through the list to find a # match. Urls are matched case sensitive, everything else is case insensitive. # The root url is treated as a special case by using a token for it. #============================================== require 'cgi' module Spider + class ExclusionParser - class ExclusionParser - attr_accessor :wait_time - DISALLOW = "disallow" - DELAY = "crawl-delay" - ALLOW = "allow" - + NULL_MATCH = '*!*'.freeze + DISALLOW = 'disallow'.freeze + DELAY = 'crawl-delay'.freeze + ALLOW = 'allow'.freeze + MAX_DIRECTIVES = 1000 - NULL_MATCH = "*!*" - - def initialize(text, agent=nil) + + def initialize(text, agent = nil, status = 200) @skip_list = [] @agent_key = agent - + return if text.nil? || text.length.zero? - - if [401, 403].include? text.http_status + + if [401, 403].include? status @skip_list << [NULL_MATCH, true] return end - + begin config = parse_text(text) grab_list(config) rescue end end - + # Check to see if the given url is matched by any rule # in the file, and return it's associated status - + def excluded?(url) url = safe_unescape(url) @skip_list.each do |entry| return entry.last if url.include? entry.first return entry.last if entry.first == NULL_MATCH end - + false end def allowed?(url) !excluded?(url) end private - + # Method to process the list of directives for a given user agent. # Picks the one that applies to us, and then processes it's directives # into the skip list by splitting the strings and taking the appropriate # action. Stops after a set number of directives to avoid malformed files # or denial of service attacks - + def grab_list(config) - section = (config.include?(@agent_key) ? - config[@agent_key] : config['*']) - - if(section.length > MAX_DIRECTIVES) + if config.include?(@agent_key) + section = config[@agent_key] + else + section = config['*'] + end + + if section.length > MAX_DIRECTIVES section.slice!(MAX_DIRECTIVES, section.length) end - + section.each do |pair| key, value = pair.split(':') - - next if key.nil? || value.nil? || - key.empty? || value.empty? - + + next if key.nil? || value.nil? || + key.empty? || value.empty? + key.downcase! key.lstrip! key.rstrip! - + value.lstrip! value.rstrip! - + disallow(value) if key == DISALLOW delay(value) if key == DELAY - allow(value) if key == ALLOW + allow(value) if key == ALLOW end end - + # Top level file parsing method - makes sure carriage returns work, # strips out any BOM, then loops through each line and opens up a new - # array of directives in the hash if a user-agent directive is found - + # array of directives in the hash if a user-agent directive is found. + def parse_text(text) - current_key = "" + current_key = '' config = {} - + text.gsub!("\r", "\n") - text.gsub!("\xEF\xBB\xBF".force_encoding("ASCII-8BIT"), '') - + text = text.force_encoding('UTF-8') + text.gsub!("\xEF\xBB\xBF".force_encoding('UTF-8'), '') + text.each_line do |line| line.lstrip! line.rstrip! - line.gsub! /#.*/, '' - - if line.length.nonzero? && line =~ /[^\s]/ - - if line =~ /User-agent:\s+(.+)/i - current_key = $1.downcase - config[current_key] = [] unless config[current_key] - next + line.gsub!(/#.*/, '') + + next unless line.length.nonzero? && line =~ /[^\s]/ + + if line =~ /User-agent:\s+(.+)/i + previous_key = current_key + current_key = $1.downcase + config[current_key] = [] unless config[current_key] + + # If we've seen a new user-agent directive and the previous one + # is empty then we have a cascading user-agent string. Copy the + # new user agent array ref so both user agents are identical. + + if config.key?(previous_key) && config[previous_key].size.zero? + config[previous_key] = config[current_key] end - + + else config[current_key] << line end end - + config - end - + end + def disallow(value) - token = (value == "/" ? NULL_MATCH : value.chomp('*')) + token = (value == '/' ? NULL_MATCH : value.chomp('*')) @skip_list << [safe_unescape(token), true] end - + def allow(value) - token = (value == "/" ? NULL_MATCH : value.chomp('*')) + token = (value == '/' ? NULL_MATCH : value.chomp('*')) @skip_list << [safe_unescape(token), false] end - + def delay(value) @wait_time = WaitTime.new(value.to_i) end - + def safe_unescape(target) - t = target.gsub /%2f/, '^^^' + t = target.gsub(/%2f/, '^^^') t = CGI.unescape(t) - t.gsub /\^\^\^/, '%2f' + t.gsub(/\^\^\^/, '%2f') end end end