lib/w3clove/sitemap.rb in w3clove-0.6.2 vs lib/w3clove/sitemap.rb in w3clove-0.7

- old (removed, w3clove-0.6.2)
+ new (added, w3clove-0.7)

@@ -46,19 +46,30 @@
     #
     # It first assumes it's an XML sitemap; if no locations found, it will try to
     # scrape the links from HTML.
     #
     # For HTML sources, it will only get the links that start with the sitemap url, convert relative links
-    # to absolute links, remove anchors from links, and include the sitemap url
+    # to absolute links, remove anchors from links, include the sitemap url, and exclude links that don't
+    # seem to point to HTML (like images, multimedia, text, javascript...)
     def pages_in_sitemap
       pages = xml_locations.map {|loc| W3Clove::Page.new(loc.text)}
       if pages.empty?
         m = MetaInspector.new(url)
-        links = m.absolute_links.select {|l| l.start_with?(m.url)}.map {|l| l.split('#')[0]}.uniq
+        links = m.absolute_links.select {|l| l.start_with?(m.url) && looks_like_html?(l)}.map {|l| l.split('#')[0]}.uniq
         links << m.url unless (links.include?(m.url) || links.include?("#{m.url}/"))
         pages = links.map {|link| W3Clove::Page.new(link)}
       end
       pages
+    end
+
+    # Tells if the given url looks like an HTML page.
+    # That is, it does not look like javascript, image, pdf...
+    def looks_like_html?(url)
+      u = URI.parse(url)
+      scheme = u.scheme
+      extension = u.path.split(".").last
+
+      (scheme =~ /http[s]?/i) && (extension !~ /gif|jpg|jpeg|png|tiff|bmp|txt|pdf|doc|xls|wav|mp3|ogg/i)
     end
     def xml_locations
       Nokogiri::XML(doc).css('loc')
     end
\ No newline at end of file
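
The heart of the change is the new looks_like_html? helper: before a scraped link becomes a W3Clove::Page, its scheme and the extension of its path are checked against a blacklist of non-HTML formats. A minimal standalone sketch of that idea follows; the NON_HTML_EXTENSIONS constant, the wrapper script and the example.com sample links are invented for illustration and are not part of the gem.

# Standalone sketch of the 0.7 filter: keep http(s) URLs whose path does not
# end in a known non-HTML extension (images, documents, audio...).
# NON_HTML_EXTENSIONS and the sample links below are illustrative only.
require 'uri'

NON_HTML_EXTENSIONS = /gif|jpg|jpeg|png|tiff|bmp|txt|pdf|doc|xls|wav|mp3|ogg/i

def looks_like_html?(url)
  u = URI.parse(url)
  # take the part of the path after the last dot (or the whole path if it has no dot)
  extension = u.path.split(".").last

  # require an http/https scheme and reject blacklisted extensions
  (u.scheme =~ /http[s]?/i) && (extension !~ NON_HTML_EXTENSIONS)
end

links = %w[
  http://example.com/
  http://example.com/about.html
  http://example.com/logo.png
  http://example.com/report.pdf
  ftp://example.com/archive.zip
]

puts links.select { |link| looks_like_html?(link) }
# http://example.com/
# http://example.com/about.html

Running the sketch prints only the first two links; the image, the PDF and the non-HTTP link are dropped, which is the filtering the 0.7 HTML-scraping fallback relies on before building its page list.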