sitemap.rb in w3clove-0.7.10

- old
+ new

@@ -51,11 +51,11 @@
     #
     # For HTML sources, it will only get the links that start with the sitemap root url, convert relative links
     # to absolute links, remove anchors from links, include the sitemap url, and exclude links that don't
     # seem to point to HTML (like images, multimedia, text, javascript...)
     def pages_in_sitemap
-      pages = xml_locations.map {|loc| W3Clove::Page.new(loc.text)}
+      pages = xml_locations.select {|loc| looks_like_html?(loc.text)}.map {|loc| W3Clove::Page.new(loc.text)}
       if pages.empty?
         m     = MetaInspector.new(url, timeout)
         links = [m.url]
 
         m.links.select {|l| l.start_with?(m.root_url) && looks_like_html?(l)}.map {|l| l.split('#')[0]}.uniq.each do |link|
@@ -73,13 +73,13 @@
 
     # Tells if the given url looks like an HTML page.
     # That is, it does not look like javascript, image, pdf...
     def looks_like_html?(url)
       # Escape characters that are not legal in a URI (spaces, non-ASCII...)
       # so that URI.parse accepts them. URI::DEFAULT_PARSER.escape is used
       # because URI.encode was removed in Ruby 3.0. Urls that are malformed
       # even after escaping make URI.parse raise URI::InvalidURIError.
       uri = URI.parse(URI::DEFAULT_PARSER.escape(url))

       # Only http and https links count; relative links have no scheme at all.
       return false unless uri.scheme =~ /\Ahttps?\z/i

       # Take the extension from the last path segment only. A missing path or
       # a path without a dot (including the empty path of a bare
       # "http://example.com") yields an empty extension, which counts as HTML.
       extension = File.extname(uri.path.to_s).delete('.')

       # The whole extension must match, so paths that merely contain one of
       # these tokens (like "/blogging", which contains "ogg") are not rejected.
       extension !~ /\A(?:gif|jpg|jpeg|png|tiff|bmp|txt|pdf|docx?|rtf|xml|xlsx?|csv|wav|mp3|ogg)\z/i
     end
 
     # Parses the document as XML and returns the nodes selected by 'loc'.
     def xml_locations
       parsed = Nokogiri::XML(doc)
       parsed.css('loc')
     end
\ No newline at end of file