lib/w3clove/sitemap.rb in w3clove-0.6.2 vs lib/w3clove/sitemap.rb in w3clove-0.7
- old
+ new
@@ -46,19 +46,30 @@
#
# It first assumes it's an XML sitemap; if no locations are found, it will try to
# scrape the links from the HTML instead.
#
# For HTML sources, it will only get the links that start with the sitemap url, convert relative links
- # to absolute links, remove anchors from links, and include the sitemap url
+ # to absolute links, remove anchors from links, include the sitemap url itself, and exclude links
+ # that don't seem to point to HTML (like images, multimedia, text, javascript...)
def pages_in_sitemap
  pages = xml_locations.map {|loc| W3Clove::Page.new(loc.text)}
  if pages.empty?
    m = MetaInspector.new(url)
-   links = m.absolute_links.select {|l| l.start_with?(m.url)}.map {|l| l.split('#')[0]}.uniq
+   links = m.absolute_links.select {|l| l.start_with?(m.url) && looks_like_html?(l)}.map {|l| l.split('#')[0]}.uniq
    links << m.url unless (links.include?(m.url) || links.include?("#{m.url}/"))
    pages = links.map {|link| W3Clove::Page.new(link)}
  end
  pages
+ end
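
For illustration, here is a minimal standalone sketch of the link clean-up done in the HTML branch above. The site url and links array are invented; the real method gets its links from MetaInspector#absolute_links. The looks_like_html? filter added in 0.7 is left out here and shown below.

  site  = "http://example.com"
  links = ["http://example.com/about#team",
           "http://example.com/about#history",
           "http://example.com/logo.png",
           "http://elsewhere.org/page"]

  # Keep internal links, strip anchors, remove duplicates,
  # and make sure the site url itself is in the list.
  page_links = links.select { |l| l.start_with?(site) }
                    .map { |l| l.split('#')[0] }
                    .uniq
  page_links << site unless page_links.include?(site) || page_links.include?("#{site}/")
  page_links
  # => ["http://example.com/about", "http://example.com/logo.png", "http://example.com"]

In 0.7, the looks_like_html? check shown below would also drop "http://example.com/logo.png" from this list.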
+
+ # Tells whether the given url looks like an HTML page;
+ # that is, it does not look like javascript, an image, a pdf...
+ def looks_like_html?(url)
+   u = URI.parse(url)
+   scheme = u.scheme
+   extension = u.path.split(".").last
+
+   (scheme =~ /http[s]?/i) && (extension !~ /gif|jpg|jpeg|png|tiff|bmp|txt|pdf|doc|xls|wav|mp3|ogg/i)
end
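
As a concrete reading of this new check, a self-contained sketch (example URLs invented for illustration):

  require 'uri'

  # Same scheme/extension test as the method above.
  def looks_like_html?(url)
    u = URI.parse(url)
    (u.scheme =~ /http[s]?/i) &&
      (u.path.split(".").last !~ /gif|jpg|jpeg|png|tiff|bmp|txt|pdf|doc|xls|wav|mp3|ogg/i)
  end

  looks_like_html?("http://example.com/about")      # => true (no excluded extension)
  looks_like_html?("https://example.com/photo.jpg") # => false (image extension)
  looks_like_html?("mailto:someone@example.com")    # => nil, i.e. falsy (scheme is not http/https)

Note that both regular expressions are unanchored, so a path without an extension such as "http://example.com/docs" is also rejected, because the computed "extension" is the whole path "/docs", which matches the doc alternative.
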
def xml_locations
  Nokogiri::XML(doc).css('loc')
end
\ No newline at end of file
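
The XML branch is unchanged between the two versions: xml_locations collects the <loc> elements of the sitemap, and pages_in_sitemap turns their text into W3Clove::Page objects. A minimal sketch with an inline sitemap fragment (invented for illustration, standing in for the document the class actually fetches from the sitemap url), assuming Nokogiri as used above:

  require 'nokogiri'

  # Inline stand-in for the downloaded sitemap contents.
  doc = <<~XML
    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
      <url><loc>http://example.com/</loc></url>
      <url><loc>http://example.com/about</loc></url>
    </urlset>
  XML

  Nokogiri::XML(doc).css('loc').map(&:text)
  # => ["http://example.com/", "http://example.com/about"]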