# -*- coding: utf-8 -*-
#
# Racc grammar for robots.txt documents.
#
# The grammar section below drives a generated Parser class. The header
# section opens WebRobots and WebRobots::RobotsTxt, the inner section adds
# the tokenizer and error handling to the generated Parser, and the footer
# section defines RobotsTxt itself plus the Record and Line classes that
# the grammar actions instantiate.

class Parser
      rule
      # A whole document. The mid-rule action occupies val[1], which is
      # why the body is read from val[2].
      robotstxt : opt_blanklines
                    {
                      @sitemaps = []
                    }
                  body
                    {
                      body = val[2]
                      result = RobotsTxt.new(@site, body,
                        :target => @target, :sitemaps => @sitemaps)
                    }

           body :
                | blocks
                  opt_blanklines

 opt_blanklines :
                | blanklines

     blanklines : blankline
                | blanklines
                  blankline

      blankline : eol

      # Every reduced EOL token advances the current line counter.
            eol : EOL
                    {
                      @lineno += 1
                    }

      opt_space :
                | SPACE

opt_commentlines :
                | commentlines

   commentlines : comment
                | commentlines
                  comment

      # Sitemap lines are accepted wherever a comment is accepted. Their
      # values are collected into @sitemaps.
        comment : opt_space COMMENT eol
                | 'sitemap' ':' opt_space VALUE eol_opt_comment
                    {
                      @sitemaps << val[3]
                    }

      # The value of blocks is the array of records seen so far. Comment
      # blocks and orphan rule lines contribute no record.
         blocks : record
                    {
                      result = []
                      result << val[0]
                    }
                | commentblock
                    {
                      result = []
                    }
                | blocks
                  blanklines
                  record
                    {
                      result << val[2]
                    }
                | blocks
                  blanklines
                  rulelines
                    {
                      val[2].each_with_index { |line, i|
                        warn "%s line %d: %s: orphan rule line" % [@site.to_s, @rulelinenos[i], line.token] if $VERBOSE
                      }
                    }
                | blocks
                  blanklines
                  commentblock

   commentblock : commentlines

         record : opt_commentlines
                  agentlines
                  opt_rulelines
                    {
                      result = Record.new(val[1], val[2])
                    }

     agentlines : agentline
                    {
                      result = [val[0]]
                    }
                | agentlines
                  agentline
                    {
                      result << val[1]
                    }
                | agentlines
                  comment

      agentline : 'user-agent' ':' opt_space VALUE eol_opt_comment
                    {
                      result = AgentLine.new(val[0], val[3])
                    }

  opt_rulelines :
                | rulelines

      # @rulelinenos is kept parallel to the rule line array so that the
      # orphan rule line warning can report a line number for every entry.
      # A rule line ends with an eol whose action has already advanced
      # @lineno by the time the rule line is reduced, hence the minus one.
      rulelines : ruleline
                    {
                      result = [result]
                      @rulelinenos = [@lineno - 1]
                    }
                | rulelines
                  ruleline
                    {
                      result << val[1]
                      @rulelinenos << (@lineno - 1)
                    }
                | rulelines
                  comment

       ruleline : allowline
                | disallowline
                | crawldelayline
                | extension

      allowline : 'allow' ':' opt_space VALUE eol_opt_comment
                    {
                      result = AllowLine.new(val[0], val[3])
                    }

   disallowline : 'disallow' ':' opt_space VALUE eol_opt_comment
                    {
                      result = DisallowLine.new(val[0], val[3])
                    }

 crawldelayline : 'crawl-delay' ':' opt_space VALUE eol_opt_comment
                    {
                      result = CrawlDelayLine.new(val[0], val[3])
                    }

      extension : TOKEN ':' opt_space VALUE eol_opt_comment
                    {
                      result = ExtentionLine.new(val[0], val[3])
                    }

eol_opt_comment : eol
                | comment

---- header

require 'strscan'
require 'uri'

class WebRobots
  # Base class for the errors defined by this file.
  class Error < StandardError
  end

  # Raised by Parser#parse in place of Racc::ParseError.
  class ParseError < Error
  end

  class RobotsTxt
---- inner

      # target:: optional user-agent string; it is passed on to every
      #          RobotsTxt this parser builds.
      def initialize(target = nil)
        super()
        @target = target
      end

      # Like #parse, but never raises Error: on failure it returns a
      # RobotsTxt with no records that carries the error in its :error
      # option.
      def parse!(input, site)
        parse(input, site)
      rescue Error => e
        RobotsTxt.new(site, nil, :error => e, :target => @target)
      end

      KNOWN_TOKENS = %w[User-agent Allow Disallow Crawl-delay Sitemap]
      RE_KNOWN_TOKENS = /#{KNOWN_TOKENS.join('|')}/i

      # Tokenizes +input+ (a String, or any object responding to #read)
      # into @q and then runs the generated parser over it.
      #
      # Returns the value of the robotstxt rule, a RobotsTxt.
      # Raises ParseError when the input cannot be tokenized or parsed.
      def parse(input, site)
        @q = []
        @errors = []
        @lineno = 1
        @site = site

        string = input.respond_to?(:read) ? input.read : input
        s = StringScanner.new(string)
        # True between a ':' and the end of its field value, so that the
        # grammar always receives a VALUE token, even an empty one.
        value_expected = false

        until s.eos?
          if t = s.scan(/[ \t]*\r?\n/)
            if value_expected
              @q << [:VALUE, '']
            end
            @q << [:EOL, t]
            value_expected = false
          elsif t = s.scan(/[ \t]+/)
            @q << [:SPACE, t]
          elsif t = s.scan(/:/)
            @q << [t, t]
            value_expected = true
          elsif t = s.scan(/#.*/)
            if value_expected
              @q << [:VALUE, '']
              # The empty value has been emitted; without this reset the
              # following newline would emit a second VALUE and the line
              # would be rejected by the grammar.
              value_expected = false
            end
            @q << [:COMMENT, t]
          else
            if value_expected
              # The value runs up to optional trailing blanks followed by
              # a comment or the end of the line.
              if t = s.scan(/.*?(?=[ \t]*(?:#|$))/)
                @q << [:VALUE, t]
              else
                # parse_error takes the message only and adds the line
                # number itself.
                parse_error "unexpected characters: %s" % s.check(/.*/)
              end
              value_expected = false
            else
              if t = s.scan(RE_KNOWN_TOKENS)
                # Known field names become their own lowercased token type.
                @q << [t.downcase, t]
              elsif t = s.scan(/[^\x00-\x1f\x7f()<>@,;:\\"\/\[\]?={}]+/)
                @q << [:TOKEN, t]
              else
                parse_error "unexpected characters: %s" % s.check(/.*/)
              end
            end
          end
        end

        # Make sure the last line is terminated for the grammar.
        @q << [:EOL, ''] if !@q.empty? && @q.last.first != :EOL

        @pos = -1

        do_parse
      rescue Racc::ParseError => e
        raise ParseError, e.message
      end

      # Hands the next queued token to the generated parser; returns nil
      # once the queue is exhausted.
      def next_token
        @q[@pos += 1]
      end

      # Called by the generated parser when it meets an unexpected token.
      def on_error(token_id, value, stack)
        parse_error "unexpected %s: %s" % [token_to_str(token_id), value]
      end

      # Prefixes +message+ with the site and the current line number, then
      # either records it in @errors (when @lax is set) or raises
      # Racc::ParseError.
      def parse_error(message)
        message = "%s line %d: %s" % [@site.to_s, @lineno, message]
        if @lax
          @errors << message
        else
          raise Racc::ParseError, message
        end
      end

---- footer
    # site::    the site the records belong to.
    # records:: array of Record objects, or nil.
    # options:: hash that may contain :error, :target and :sitemaps.
    #
    # When a target is given, only records matching it are kept. Default
    # records (those with a "*" agent line) are always kept and are moved
    # to the end so that specific records are found first.
    def initialize(site, records, options = nil)
      @timestamp = Time.now
      @site = site
      @options = options || {}
      @last_checked = nil

      @error = @options[:error]
      @target = @options[:target]
      @sitemaps = @options[:sitemaps] || []

      if records && !records.empty?
        @records, defaults = [], []
        records.each { |record|
          if record.default?
            defaults << record
          elsif !@target || record.match?(@target)
            @records << record
          end
        }
        @records.concat(defaults)
      else
        @records = []
      end
    end

    attr_reader :timestamp, :site, :sitemaps
    attr_accessor :error

    # Raises the stored error, if there is one.
    def error!
      raise @error if @error
    end

    # Resolves the user-agent to use for a lookup. A targeted instance
    # must not be given a user-agent; an untargeted one requires it.
    def target(user_agent = nil)
      if user_agent
        raise ArgumentError, "this instance is targeted for #{@target}" if @target
        user_agent
      else
        raise ArgumentError, "user_agent is mandatory for an untargeted instance" if !@target
        @target
      end
    end
    private :target

    # Returns the first kept record that matches the user-agent, or nil.
    def find_record(user_agent = nil)
      user_agent = target(user_agent)
      @records.find { |record|
        record.match?(user_agent)
      }
    end
    private :find_record

    # Tells whether +request_uri+ may be fetched. Returns true when no
    # record matches the user-agent.
    #
    # When a previous check has been made and the matching record has a
    # crawl delay, this sleeps for whatever part of the delay has not yet
    # elapsed since that check.
    def allow?(request_uri, user_agent = nil)
      record = find_record(user_agent) or return true
      allow = record.allow?(request_uri)
      if @last_checked and delay = record.delay
        delay -= Time.now - @last_checked
        sleep delay if delay > 0
      end
      @last_checked = Time.now
      return allow
    end

    # Returns the extension fields of the matching record, or an empty
    # hash when no record matches.
    def options(user_agent = nil)
      record = find_record(user_agent) or return {}
      record.options
    end

    # The heredoc body must start in the first column: the grammar does
    # not accept leading blanks before a field name.
    DISALLOW_ALL = <<-TXT
User-Agent: *
Disallow: /
    TXT

    # Builds a RobotsTxt that disallows everything and carries +reason+
    # as its error.
    def self.unfetchable(site, reason, target = nil)
      Parser.new(target).parse(DISALLOW_ALL, site).tap { |robots_txt|
        robots_txt.error = reason
      }
    end

    # One group of agent lines together with the rule lines that follow.
    class Record
      def initialize(agentlines, rulelines)
        @patterns = agentlines.map { |agentline| agentline.pattern }
        @acls = []
        @delay = nil
        @options = {}
        rulelines.each { |ruleline|
          case ruleline
          when AccessControlLine
            @acls << ruleline
          when CrawlDelayLine
            @delay = ruleline.delay
          else
            @options[ruleline.token.downcase] = ruleline.value
          end
        }
        # Longest value first; among values of equal length an Allow
        # line sorts before a Disallow line.
        @acls.sort! { |a, b|
          [
            b.value.length, b.is_a?(AllowLine) ? 1 : 0
          ] <=> [
            a.value.length, a.is_a?(AllowLine) ? 1 : 0
          ]
        }
      end

      attr_reader :delay, :options

      # True when any agent pattern matches +user_agent+.
      def match?(user_agent)
        @patterns.any? { |pattern|
          pattern.match(user_agent)
        }
      end

      # True when the record has a "*" agent line, which AgentLine
      # compiles to the empty regexp.
      def default?
        @patterns.include?(//)
      end

      # The first matching access control line decides; with no match the
      # request is allowed.
      def allow?(request_uri)
        @acls.each { |acl|
          if acl.match?(request_uri)
            return acl.allow?
          end
        }
        return true
      end
    end

    # Base class for parsed lines: stores the field name and value, then
    # lets the subclass precompute whatever it needs in #compile.
    class Line
      def initialize(token, value)
        @token = token
        @value = value
        compile
      end

      attr_reader :token, :value

      def compile
        self
      end
    end

    # A User-agent line. "*" compiles to a pattern matching anything; any
    # other value is matched literally and case-insensitively.
    class AgentLine < Line
      def compile
        if @value == '*'
          @pattern = //
        else
          @pattern = Regexp.new(Regexp.quote(@value), Regexp::IGNORECASE)
        end
        self
      end

      attr_reader :pattern
    end

    # Shared implementation of Allow and Disallow lines.
    class AccessControlLine < Line
      # Translates the path pattern into a regexp anchored at the start of
      # the string: "*" becomes ".*", "$" anchors the end and stops the
      # translation, "%2F" is kept encoded (in either letter case) and
      # every other percent escape is decoded before being quoted.
      def compile
        @empty = @value.empty?
        re_src = '\A'
        s = StringScanner.new(@value)
        until s.eos?
          if t = s.scan(/[^%*$]+/)
            re_src << Regexp.quote(t)
          elsif t = s.scan(/%([0-9a-f]{2})/i)
            c = s[1].to_i(16)
            if c == 0x2f
              re_src << '%2[fF]'
            else
              re_src << Regexp.quote('%c' % c)
            end
          elsif t = s.scan(/\*/)
            re_src << '.*'
          elsif t = s.scan(/\$/)
            re_src << '\z'
            break
          else
            # A "%" that does not start a valid escape is taken literally.
            re_src << Regexp.quote(s.scan(/./))
          end
        end
        @pattern = Regexp.new(re_src, Regexp::MULTILINE)
        self
      end

      # An empty value never matches. The request URI has its percent
      # escapes decoded, except "%2F", before it is matched.
      def match?(request_uri)
        return false if @empty
        transformed = request_uri.gsub(/(%2[fF])|%([0-9a-f]{2})/i) { $1 || '%c' % $2.to_i(16) }
        !!@pattern.match(transformed)
      end
    end

    class AllowLine < AccessControlLine
      def allow?
        true
      end
    end

    class DisallowLine < AccessControlLine
      def allow?
        false
      end
    end

    # A Crawl-delay line. The delay is a Float when the value starts with
    # a decimal number, an Integer when it starts with a whole number, and
    # nil otherwise.
    class CrawlDelayLine < Line
      def compile
        case @value
        when /\A((0|[1-9][0-9]*)\.[0-9]+)/
          @delay = @value.to_f
        when /\A(0|[1-9][0-9]*)/
          @delay = @value.to_i
        else
          @delay = nil
        end
        self
      end

      attr_reader :delay
    end

    # Any field that is not one of the known ones; Record stores these in
    # its options hash.
    class ExtentionLine < Line
    end
  end
end