lib/w3clove/sitemap.rb in w3clove-0.6.2 vs lib/w3clove/sitemap.rb in w3clove-0.7

- old (removed, w3clove-0.6.2)
+ new (added, w3clove-0.7)

@@ -46,19 +46,30 @@
     #
     # It first assumes it's an XML sitemap; if no locations found, it will try to
     # scrape the links from HTML.
     #
     # For HTML sources, it will only get the links that start with the sitemap url, convert relative links
-    # to absolute links, remove anchors from links, and include the sitemap url
+    # to absolute links, remove anchors from links, include the sitemap url, and exclude links that don't
+    # seem to point to HTML (like images, multimedia, text, javascript...)
     def pages_in_sitemap
       pages = xml_locations.map {|loc| W3Clove::Page.new(loc.text)}
       if pages.empty?
         m = MetaInspector.new(url)
-        links = m.absolute_links.select {|l| l.start_with?(m.url)}.map {|l| l.split('#')[0]}.uniq
+        links = m.absolute_links.select {|l| l.start_with?(m.url) && looks_like_html?(l)}.map {|l| l.split('#')[0]}.uniq
         links << m.url unless (links.include?(m.url) || links.include?("#{m.url}/"))
         pages = links.map {|link| W3Clove::Page.new(link)}
       end
       pages
+    end
+
+    # Tells if the given url looks like an HTML page.
+    # That is, it does not look like javascript, image, pdf...
+    def looks_like_html?(url)
+      u = URI.parse(url)
+      scheme = u.scheme
+      extension = u.path.split(".").last
+
+      (scheme =~ /http[s]?/i) && (extension !~ /gif|jpg|jpeg|png|tiff|bmp|txt|pdf|doc|xls|wav|mp3|ogg/i)
     end
     def xml_locations
       Nokogiri::XML(doc).css('loc')
     end
\ No newline at end of file
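
The heart of the change is the new looks_like_html? helper: before a scraped link becomes a W3Clove::Page, its scheme and the extension of its path are checked against a blacklist of non-HTML formats. A minimal standalone sketch of that idea follows; the NON_HTML_EXTENSIONS constant, the wrapper script and the example.com sample links are invented for illustration and are not part of the gem.

# Standalone sketch of the 0.7 filter: keep http(s) URLs whose path does not
# end in a known non-HTML extension (images, documents, audio...).
# NON_HTML_EXTENSIONS and the sample links below are illustrative only.
require 'uri'

NON_HTML_EXTENSIONS = /gif|jpg|jpeg|png|tiff|bmp|txt|pdf|doc|xls|wav|mp3|ogg/i

def looks_like_html?(url)
  u = URI.parse(url)
  # take the part of the path after the last dot (or the whole path if it has no dot)
  extension = u.path.split(".").last

  # require an http/https scheme and reject blacklisted extensions
  (u.scheme =~ /http[s]?/i) && (extension !~ NON_HTML_EXTENSIONS)
end

links = %w[
  http://example.com/
  http://example.com/about.html
  http://example.com/logo.png
  http://example.com/report.pdf
  ftp://example.com/archive.zip
]

puts links.select { |link| looks_like_html?(link) }
# http://example.com/
# http://example.com/about.html

Running the sketch prints only the first two links; the image, the PDF and the non-HTTP link are dropped, which is the filtering the 0.7 HTML-scraping fallback relies on before building its page list.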