# -*- coding: utf-8 -*-
require 'helper'

class TestWebRobots < Test::Unit::TestCase
  context "robots.txt with no rules" do
    setup do
      @robots = WebRobots.new('RandomBot', :http_get => lambda { |uri|
          case uri.to_s
          when 'http://site1.example.org/robots.txt'
            <<-'TXT'
            TXT
          when 'http://site2.example.org/robots.txt'
            <<-'TXT'

            TXT
          when 'http://site3.example.org/robots.txt'
            <<-'TXT'
#comment
            TXT
          when 'http://site4.example.org/robots.txt'
            <<-'TXT'

#comment

            TXT
          else
            raise "#{uri} is not supposed to be fetched"
          end
        })
    end

    should "allow any robot" do
      assert @robots.allowed?('http://site1.example.org/index.html')
      assert @robots.allowed?('http://site1.example.org/private/secret.txt')
      assert @robots.allowed?('http://site2.example.org/index.html')
      assert @robots.allowed?('http://site2.example.org/private/secret.txt')
      assert @robots.allowed?('http://site3.example.org/index.html')
      assert @robots.allowed?('http://site3.example.org/private/secret.txt')
      assert @robots.allowed?('http://site4.example.org/index.html')
      assert @robots.allowed?('http://site4.example.org/private/secret.txt')
    end
  end

  context "robots.txt that cannot be fetched" do
    setup do
      @robots = WebRobots.new('RandomBot', :http_get => lambda { |uri|
          case uri.to_s
          when 'http://site1.example.org/robots.txt'
            raise Net::HTTPFatalError.new(
              'Internal Server Error',
              Net::HTTPInternalServerError.new('1.1', '500', 'Internal Server Error'))
          when 'http://site2.example.org/robots.txt'
            raise Net::HTTPRetriableError.new(
              'Found',
              Net::HTTPFound.new('1.1', '302', 'Found'))
          when 'http://site3.example.org/robots.txt'
            raise Errno::ECONNREFUSED
          when 'http://site4.example.org/robots.txt'
            raise SocketError, "getaddrinfo: nodename nor servname provided, or not known"
          when 'http://site5.example.org/robots.txt'
            nil
          else
            raise "#{uri} is not supposed to be fetched"
          end
        })
    end

    should "disallow any robot" do
      assert @robots.disallowed?('http://site1.example.org/index.html')
      assert @robots.disallowed?('http://site1.example.org/private/secret.txt')
      assert @robots.disallowed?('http://site2.example.org/index.html')
      assert @robots.disallowed?('http://site2.example.org/private/secret.txt')
      assert @robots.disallowed?('http://site3.example.org/index.html')
      assert @robots.disallowed?('http://site3.example.org/private/secret.txt')
      assert @robots.disallowed?('http://site4.example.org/index.html')
      assert @robots.disallowed?('http://site4.example.org/private/secret.txt')
      assert @robots.disallowed?('http://site5.example.org/index.html')
      assert @robots.disallowed?('http://site5.example.org/private/secret.txt')
    end
  end

  context "robots.txt with some rules" do
    setup do
      http_get = lambda { |uri|
        case uri.to_s
        when 'http://www.example.org/robots.txt'
          <<-'TXT'
# Punish evil bots
User-Agent: evil
Disallow: /
Disallow-Not: /  # parser teaser

User-Agent: good
# Be generous to good bots
Disallow: /2heavy/
Allow: /2heavy/*.htm
Disallow: /2heavy/*.htm$

User-Agent: *
Disallow: /2heavy/
Disallow: /index.html
# Allow takes precedence over Disallow if the pattern lengths are the same.
Allow: /index.html
          TXT
        when 'http://www.example.com/robots.txt'
          <<-'TXT'
# Default rule is evaluated last even if it is put first.
User-Agent: *
Disallow: /2heavy/
Disallow: /index.html
# Allow takes precedence over Disallow if the pattern lengths are the same.
Allow: /index.html

# Punish evil bots
User-Agent: evil
Disallow: /

User-Agent: good
# Be generous to good bots
Disallow: /2heavy/
Allow: /2heavy/*.htm
Disallow: /2heavy/*.htm$
          TXT
        when 'http://koster1.example.net/robots.txt'
          <<-'TXT'
User-Agent: *
Disallow: /tmp
          TXT
        when 'http://koster2.example.net/robots.txt'
          <<-'TXT'
User-Agent: *
Disallow: /tmp/
          TXT
        when 'http://koster3.example.net/robots.txt'
          <<-'TXT'
User-Agent: *
Disallow: /a%3cd.html
          TXT
        when 'http://koster4.example.net/robots.txt'
          <<-'TXT'
User-Agent: *
Disallow: /a%3Cd.html
          TXT
        when 'http://koster5.example.net/robots.txt'
          <<-'TXT'
User-Agent: *
Disallow: /a%2fb.html
          TXT
        when 'http://koster6.example.net/robots.txt'
          <<-'TXT'
User-Agent: *
Disallow: /a/b.html
          TXT
        when 'http://koster7.example.net/robots.txt'
          <<-'TXT'
User-Agent: *
Disallow: /%7ejoe/index.html
          TXT
        when 'http://koster8.example.net/robots.txt'
          <<-'TXT'
User-Agent: *
Disallow: /~joe/index.html
          TXT
        else
          raise "#{uri} is not supposed to be fetched"
        end
      }
      @robots = WebRobots.new('RandomBot', :http_get => http_get)
      @robots_good = WebRobots.new('GoodBot', :http_get => http_get)
      @robots_evil = WebRobots.new('EvilBot', :http_get => http_get)
    end

    should "properly restrict access" do
      assert_nothing_raised {
        assert @robots_good.allowed?('http://www.example.org/index.html')
      }
      assert !@robots_good.allowed?('http://www.example.org/2heavy/index.php')
      assert @robots_good.allowed?('http://www.example.org/2HEAVY/index.php')
      assert !@robots_good.allowed?(URI('http://www.example.org/2heavy/index.php'))
      assert @robots_good.allowed?('http://www.example.org/2heavy/index.html')
      assert @robots_good.allowed?('http://WWW.Example.Org/2heavy/index.html')
      assert !@robots_good.allowed?('http://www.example.org/2heavy/index.htm')
      assert !@robots_good.allowed?('http://WWW.Example.Org/2heavy/index.htm')

      assert !@robots_evil.allowed?('http://www.example.org/index.html')
      assert !@robots_evil.allowed?('http://www.example.org/2heavy/index.php')
      assert !@robots_evil.allowed?('http://www.example.org/2heavy/index.html')
      assert !@robots_evil.allowed?('http://www.example.org/2heavy/index.htm')

      assert @robots.allowed?('http://www.example.org/index.html')
      assert !@robots.allowed?('http://www.example.org/2heavy/index.php')
      assert !@robots.allowed?('http://www.example.org/2heavy/index.html')
      assert !@robots.allowed?('http://www.example.org/2heavy/index.htm')

      assert @robots_good.allowed?('http://www.example.com/index.html')
      assert !@robots_good.allowed?('http://www.example.com/2heavy/index.php')
      assert @robots_good.allowed?('http://www.example.com/2heavy/index.html')
      assert !@robots_good.allowed?('http://www.example.com/2heavy/index.htm')

      assert !@robots_evil.allowed?('http://www.example.com/index.html')
      assert !@robots_evil.allowed?('http://www.example.com/2heavy/index.php')
      assert !@robots_evil.allowed?('http://www.example.com/2heavy/index.html')
      assert !@robots_evil.allowed?('http://www.example.com/2heavy/index.htm')

      assert @robots.allowed?('http://www.example.com/index.html')
      assert !@robots.allowed?('http://www.example.com/2heavy/index.php')
      assert !@robots.allowed?('http://www.example.com/2heavy/index.html')
      assert !@robots.allowed?('http://www.example.com/2heavy/index.htm')
    end

    should "follow what is said in Koster's draft" do
      assert @robots.disallowed?('http://koster1.example.net/tmp')
      assert @robots.disallowed?('http://koster1.example.net/tmp.html')
      assert @robots.disallowed?('http://koster1.example.net/tmp/a.html')

      assert !@robots.disallowed?('http://koster2.example.net/tmp')
      assert @robots.disallowed?('http://koster2.example.net/tmp/')
      assert @robots.disallowed?('http://koster2.example.net/tmp/a.html')

      assert @robots.disallowed?('http://koster3.example.net/a%3cd.html')
      assert @robots.disallowed?('http://koster3.example.net/a%3Cd.html')

      assert @robots.disallowed?('http://koster4.example.net/a%3cd.html')
      assert @robots.disallowed?('http://koster4.example.net/a%3Cd.html')

      assert @robots.disallowed?('http://koster5.example.net/a%2fb.html')
      assert !@robots.disallowed?('http://koster5.example.net/a/b.html')

      assert !@robots.disallowed?('http://koster6.example.net/a%2fb.html')
      assert @robots.disallowed?('http://koster6.example.net/a/b.html')

      assert @robots.disallowed?('http://koster7.example.net/~joe/index.html')

      assert @robots.disallowed?('http://koster8.example.net/%7Ejoe/index.html')
    end
  end

  context "robots.txt with errors" do
    setup do
      @turn1 = @turn2 = 0
      @http_get = lambda { |uri|
        case uri.to_s
        when 'http://www.example.org/robots.txt'
          if (@turn1 += 1) % 2 == 1
            <<-'TXT'
# some comment
User-Agent: thebot
# Disallow: /
Disallow: /2heavy/
# Allow: /2heavy/notsoheavy
Allow: /2heavy/*.html

User-Agent: anotherbot
# Disallow: /
Disallow: /2heavy/
# Allow: /2heavy/notsoheavy
Allow: /2heavy/*.html
            TXT
          else
            <<-'TXT'
# some comment
User-Agent: thebot
# Disallow: /
Disallow: /2heavy/
# Allow: /2heavy/notsoheavy
Allow: /2heavy/*.html
#
User-Agent: anotherbot
# Disallow: /
Disallow: /2heavy/
# Allow: /2heavy/notsoheavy
Allow: /2heavy/*.html
            TXT
          end
        when 'http://www.example.com/robots.txt'
          if (@turn2 += 1) % 2 == 1
            <<-'TXT'
# some comment
#User-Agent: thebot
# Disallow: /
Disallow: /2heavy/
# Allow: /2heavy/notsoheavy
Allow: /2heavy/*.html

User-Agent: anotherbot
# Disallow: /
Disallow: /2heavy/
# Allow: /2heavy/notsoheavy
Allow: /2heavy/*.html
            TXT
          else
            <<-'TXT'
# some comment
User-Agent: thebot
# Disallow: /
Disallow: /2heavy/
# Allow: /2heavy/notsoheavy
Allow: /2heavy/*.html

User-Agent: anotherbot
# Disallow: /
Disallow: /2heavy/
# Allow: /2heavy/notsoheavy
Allow: /2heavy/*.html
            TXT
          end
        else
          raise "#{uri} is not supposed to be fetched"
        end
      }
    end

    should "raise ParseError" do
      robots = WebRobots.new('TheBot', :http_get => @http_get)

      url = 'http://www.example.org/2heavy/index.php'

      assert_nil robots.error(url)
      assert !robots.allowed?(url)
      assert_nothing_raised {
        robots.error!(url)
      }

      robots.reset(url)

      assert robots.allowed?(url)
      error = robots.error(url)
      assert_instance_of WebRobots::ParseError, error
      assert_equal URI('http://www.example.org/'), error.site
      assert_raise(WebRobots::ParseError) {
        robots.error!(url)
      }

      robots.reset(url)

      assert_nil robots.error(url)
      assert !robots.allowed?(url)
      assert_nothing_raised {
        robots.error!(url)
      }

      url = 'http://www.example.com/2heavy/index.php'

      assert robots.allowed?(url)
      assert_instance_of WebRobots::ParseError, robots.error(url)
      assert_raise(WebRobots::ParseError) {
        robots.error!(url)
      }

      robots.reset(url)

      assert_nil robots.error(url)
      assert !robots.allowed?(url)
      assert_nothing_raised {
        robots.error!(url)
      }

      robots.reset(url)

      assert robots.allowed?(url)
      assert_instance_of WebRobots::ParseError, robots.error(url)
      assert_raise(WebRobots::ParseError) {
        robots.error!(url)
      }
    end
  end

  context "robots.txt with options" do
    setup do
      http_get = lambda { |uri|
        case uri.to_s
        when 'http://www.example.org/robots.txt'
          <<-'TXT'
Sitemap: http://www.example.org/sitemap-host1.xml
Sitemap: http://www.example.org/sitemap-host2.xml

User-Agent: MyBot
Disallow: /2heavy/
Allow: /2heavy/*.html
Option1: Foo
Option2: Hello
Crawl-Delay: 1.5

User-Agent: *
Disallow: /2heavy/
Allow: /2heavy/*.html
# These are wrong but should be allowed
Allow: /2heavy/%
Crawl-Delay: #
Option1: Bar
Option3: Hi
          TXT
        else
          raise "#{uri} is not supposed to be fetched"
        end
      }
      @robots_mybot = WebRobots.new('MyBot', :http_get => http_get)
      @robots_hisbot = WebRobots.new('HisBot', :http_get => http_get)
    end

    should "read options" do
      options = @robots_mybot.options('http://www.example.org/')
      assert_equal 2, options.size
      assert_equal 'Foo', @robots_mybot.option('http://www.example.org/', 'Option1')
      assert_equal 'Foo', options['option1']
      assert_equal 'Hello', @robots_mybot.option('http://www.example.org/', 'Option2')
      assert_equal 'Hello', options['option2']

      options = @robots_hisbot.options('http://www.example.org/')
      assert_equal 2, options.size
      assert_equal 'Bar', @robots_hisbot.option('http://www.example.org/', 'Option1')
      assert_equal 'Bar', options['option1']
      assert_equal 'Hi', @robots_hisbot.option('http://www.example.org/', 'Option3')
      assert_equal 'Hi', options['option3']

      assert_equal %w[
        http://www.example.org/sitemap-host1.xml
        http://www.example.org/sitemap-host2.xml
      ], @robots_mybot.sitemaps('http://www.example.org/')
      assert_equal %w[
        http://www.example.org/sitemap-host1.xml
        http://www.example.org/sitemap-host2.xml
      ], @robots_hisbot.sitemaps('http://www.example.org/')

      t1 = Time.now
      @robots_mybot.allowed?('http://www.example.org/')
      @robots_mybot.allowed?('http://www.example.org/article1.html')
      t2 = Time.now
      assert_in_delta 1.5, t2 - t1, 0.1
      @robots_mybot.allowed?('http://www.example.org/article2.html')
      t3 = Time.now
      assert_in_delta 1.5, t3 - t2, 0.1
    end
  end

  context "robots.txt with options" do
    setup do
      http_get = lambda { |uri|
        case uri.to_s
        when 'http://www.example.org/robots.txt'
          <<-'TXT'
User-Agent: *
Disallow: /
          TXT
        else
          raise "#{uri} is not supposed to be fetched"
        end
      }
      @robots = WebRobots.new('RandomBot', :http_get => http_get)
    end

    should "validate URI" do
      assert_raise(ArgumentError) {
        @robots.allowed?('www.example.org/')
      }
      assert_raise(ArgumentError) {
        @robots.allowed?('::/home/knu')
      }
    end
  end

  context "robots.txt in the real world" do
    setup do
      @testbot = WebRobots.new('TestBot')
      @msnbot = WebRobots.new('TestMSNBot')  # matches msnbot
    end

    should "be parsed for major sites" do
      assert_nothing_raised {
        assert !@testbot.allowed?("http://www.google.com/search")
        assert !@testbot.allowed?("https://www.google.com/search")
        assert !@testbot.allowed?("http://www.google.com/news/section?pz=1&cf=all&ned=jp&topic=y&ict=ln")
        assert @testbot.allowed?("http://www.google.com/news/directory?pz=1&cf=all&ned=us&hl=en&sort=users&category=6")
      }

      assert_nothing_raised {
        assert @testbot.allowed?("http://www.yahoo.com/")
        assert !@testbot.allowed?("http://www.yahoo.com/?")
        assert !@testbot.allowed?("http://www.yahoo.com/p/foo")
      }

      assert_nothing_raised {
        assert !@testbot.allowed?("http://store.apple.com/vieworder")
        assert @msnbot.allowed?("http://store.apple.com/vieworder")
      }

      assert_nothing_raised {
        assert !@testbot.allowed?("http://github.com/login")
      }
    end
  end

  context "meta robots tag" do
    setup do
      # The meta tag values correspond to the assertions below: the generic
      # ROBOTS tag sets nofollow only, Slurp gets noindex,nofollow, and
      # googlebot gets noindex,noarchive.
      @doc = Nokogiri::HTML(<<-HTML)
<html>
  <head>
    <meta name="ROBOTS" content="NOFOLLOW">
    <meta name="Slurp" content="NOINDEX,NOFOLLOW">
    <meta name="googlebot" content="NOINDEX,NOARCHIVE">
  </head>
  <body>
    test
  </body>
</html>
      HTML
    end

    should "be properly parsed when given in HTML string" do
      assert !@doc.noindex?
      assert @doc.nofollow?
      assert @doc.noindex?('slurp')
      assert @doc.nofollow?('slurp')
      assert @doc.noindex?('googlebot')
      assert !@doc.nofollow?('googlebot')
      assert @doc.meta_robots('googlebot').include?('noarchive')
    end
  end

  class Agent
    def initialize
      @robots = WebRobots.new 'agent', :http_get => method(:get)
    end
    def get uri
      @robots.allowed? uri

      if uri.request_uri == '/robots.txt' then
        ''
      else
        'content'
      end
    end
  end

  context "embedded in a user-agent" do
    setup do
      @agent = Agent.new
    end

    should "fetch robots.txt" do
      body = @agent.get URI.parse 'http://example/robots.html'

      assert_equal 'content', body
    end
  end

  context "robots.txt with a space at the end of the last line" do
    setup do
      @robots = WebRobots.new('RandomBot', :http_get => lambda { |uri|
          res = case uri.to_s
                when 'http://site1.example.com/robots.txt'
                  <<-'TXT'
User-agent: *
Request-rate: 1/30
Disallow: /util/

Sitemap: http://site1.example.com/text/sitemap.xml 
                  TXT
                when 'http://site2.example.com/robots.txt'
                  <<-'TXT'
User-agent: *
Request-rate: 1/30
Disallow: /util/

Sitemap: http://site2.example.com/text/sitemap.xml 
                  TXT
                else
                  raise "#{uri} is not supposed to be fetched"
                end
          # This chomp is actually key to the test.  Remove the final EOL.
          # The final line should be the one ending with the space.
          res.chomp
        })
    end

    should "be properly parsed" do
      assert_equal(["http://site1.example.com/text/sitemap.xml"],
                   @robots.sitemaps("http://site1.example.com/"))
      assert_equal(["http://site2.example.com/text/sitemap.xml"],
                   @robots.sitemaps("http://site2.example.com/"))
    end
  end

  context "robots.txt cache" do
    setup do
      @fetched = false
      @robots = WebRobots.new('RandomBot', :http_get => lambda { |uri|
          case uri.to_s
          when 'http://site1.example.org/robots.txt'
            @fetched = true
            <<-'TXT'
User-Agent: *
Disallow: /foo
            TXT
          when 'http://site2.example.org/robots.txt'
            @fetched = true
            nil
          end
        })
    end

    should "persist unless cache is cleared" do
      assert !@fetched
      assert !@robots.allowed?('http://site1.example.org/foo')
      assert @fetched

      @fetched = false
      assert @robots.allowed?('http://site1.example.org/bar')
      assert !@fetched
      assert @robots.allowed?('http://site1.example.org/baz')
      assert !@fetched
      assert !@robots.allowed?('http://site1.example.org/foo')
      assert !@fetched

      @robots.flush_cache

      assert !@fetched
      assert !@robots.allowed?('http://site1.example.org/foo')
      assert @fetched

      @fetched = false
      assert @robots.allowed?('http://site1.example.org/bar')
      assert !@fetched
      assert @robots.allowed?('http://site1.example.org/baz')
      assert !@fetched
      assert !@robots.allowed?('http://site1.example.org/foo')
      assert !@fetched
    end

    should "persist for non-existent robots.txt unless cache is cleared" do
      assert !@fetched
      assert !@robots.allowed?('http://site2.example.org/foo')
      assert @fetched

      @fetched = false
      assert !@robots.allowed?('http://site2.example.org/bar')
      assert !@fetched
      assert !@robots.allowed?('http://site2.example.org/baz')
      assert !@fetched
      assert !@robots.allowed?('http://site2.example.org/foo')
      assert !@fetched

      @robots.flush_cache

      assert !@fetched
      assert !@robots.allowed?('http://site2.example.org/foo')
      assert @fetched

      @fetched = false
      assert !@robots.allowed?('http://site2.example.org/bar')
      assert !@fetched
      assert !@robots.allowed?('http://site2.example.org/baz')
      assert !@fetched
      assert !@robots.allowed?('http://site2.example.org/foo')
      assert !@fetched
    end
  end

  context "robots.txt with just user-agent & sitemap and no blank line between them" do
    setup do
      @robots = WebRobots.new('RandomBot', :http_get => lambda { |uri|
          res = case uri.to_s
                when 'http://site1.example.com/robots.txt'
                  <<-'TXT'
User-agent: *
Sitemap: http://site1.example.com/text/sitemap.xml
                  TXT
                else
                  raise "#{uri} is not supposed to be fetched"
                end
        })
    end

    should "be properly parsed" do
      assert @robots.allowed?("http://site1.example.com/foo")
      assert_equal(["http://site1.example.com/text/sitemap.xml"],
                   @robots.sitemaps("http://site1.example.com/"))
    end
  end
end