lib/w3clove/sitemap.rb in w3clove-0.3.5 vs lib/w3clove/sitemap.rb in w3clove-0.4.0

- old
+ new

@@ -1,9 +1,10 @@
 # -*- encoding: utf-8 -*-
 
 require 'open-uri'
 require 'nokogiri'
+require 'metainspector'
 
 module W3Clove
   ##
   # A sitemap has an URL, and holds a collection of pages to be validated
   #
@@ -39,14 +40,25 @@
       binding
     end
 
     private
+    # Scrapes the url in search of links.
+    # It first assumes it's an XML sitemap; if no locations found, it will try to
+    # scrape the links from HTML.
+    # For HTML sources, it will only get the links that start with the sitemap url, convert relative links
+    # to absolute links, and remove anchors from links
     def pages_in_sitemap
-      locations.map {|loc| W3Clove::Page.new(loc.text)}
+      pages = xml_locations.map {|loc| W3Clove::Page.new(loc.text)}
+      if pages.empty?
+        m = MetaInspector.new(url)
+        links = m.absolute_links.select {|l| l.start_with?(m.url)}.map {|l| l.split('#')[0]}.uniq
+        pages = links.map {|link| W3Clove::Page.new(link)}
+      end
+      pages
     end
 
-    def locations
+    def xml_locations
       Nokogiri::XML(doc).css('loc')
     end
 
     def doc
       @doc ||= open(url)
\ No newline at end of file