lib/w3clove/sitemap.rb in w3clove-0.7.1 vs lib/w3clove/sitemap.rb in w3clove-0.7.2
- old
+ new
@@ -1,20 +1,22 @@
# -*- encoding: utf-8 -*-
require 'open-uri'
require 'nokogiri'
require 'metainspector'
+require 'timeout'
module W3Clove
##
# A sitemap has a URL and holds a collection of pages to be validated
#
class Sitemap
- attr_accessor :url
+ attr_accessor :url, :timeout
- def initialize(url)
- @url = url
+ def initialize(url, timeout = 20)
+ @url = url
+ @timeout = timeout
end
##
# Returns the first 250 unique URLs from the sitemap
def pages
@@ -51,10 +53,10 @@
# to absolute links, remove anchors from links, include the sitemap url, and exclude links that don't
# seem to point to HTML (like images, multimedia, text, javascript...)
def pages_in_sitemap
pages = xml_locations.map {|loc| W3Clove::Page.new(loc.text)}
if pages.empty?
- m = MetaInspector.new(url)
+ m = MetaInspector.new(url, timeout)
links = m.absolute_links.select {|l| l.start_with?(m.url) && looks_like_html?(l)}.map {|l| l.split('#')[0]}.uniq
links << m.url unless (links.include?(m.url) || links.include?("#{m.url}/"))
pages = links.map {|link| W3Clove::Page.new(link)}
end
pages
\ No newline at end of file