lib/w3clove/sitemap.rb in w3clove-0.7.1 vs lib/w3clove/sitemap.rb in w3clove-0.7.2

- old
+ new

@@ -1,20 +1,22 @@ # -*- encoding: utf-8 -*- require 'open-uri' require 'nokogiri' require 'metainspector' +require 'timeout' module W3Clove ## # A sitemap has an URL, and holds a collection of pages to be validated # class Sitemap - attr_accessor :url + attr_accessor :url, :timeout - def initialize(url) - @url = url + def initialize(url, timeout = 20) + @url = url + @timeout = timeout end ## # Returns the first 250 unique URLs from the sitemap def pages @@ -51,10 +53,10 @@ # to absolute links, remove anchors from links, include the sitemap url, and exclude links that don't # seem to point to HTML (like images, multimedia, text, javascript...) def pages_in_sitemap pages = xml_locations.map {|loc| W3Clove::Page.new(loc.text)} if pages.empty? - m = MetaInspector.new(url) + m = MetaInspector.new(url, timeout) links = m.absolute_links.select {|l| l.start_with?(m.url) && looks_like_html?(l)}.map {|l| l.split('#')[0]}.uniq links << m.url unless (links.include?(m.url) || links.include?("#{m.url}/")) pages = links.map {|link| W3Clove::Page.new(link)} end pages \ No newline at end of file