lib/w3clove/sitemap.rb in w3clove-0.7.3 vs lib/w3clove/sitemap.rb in w3clove-0.7.4
- old
+ new
@@ -47,18 +47,26 @@
# Scrapes the url in search of links.
#
# It first assumes it's an XML sitemap; if no locations found, it will try to
# scrape the links from HTML.
#
- # For HTML sources, it will only get the links that start with the sitemap url, convert relative links
+ # For HTML sources, it will only get the links that start with the sitemap root url, convert relative links
# to absolute links, remove anchors from links, include the sitemap url, and exclude links that don't
# seem to point to HTML (like images, multimedia, text, javascript...)
def pages_in_sitemap
# Returns an array of W3Clove::Page objects: one per XML sitemap location, or,
# when there are none, one per link collected from the page at +url+.
#
# Lines prefixed with "-" are the 0.7.3 implementation and lines prefixed with
# "+" are the 0.7.4 implementation of the HTML fallback.

# First attempt: build one page from the text of each entry in xml_locations.
pages = xml_locations.map {|loc| W3Clove::Page.new(loc.text)}
if pages.empty?
# Fallback: inspect the url with MetaInspector and use the links it reports.
m = MetaInspector.new(url, timeout)
# 0.7.3: keep links that start with m.url and pass looks_like_html?, cut each
# one at the first '#', drop exact duplicates, then append m.url unless it is
# already present either as-is or with a trailing slash.
- links = m.absolute_links.select {|l| l.start_with?(m.url) && looks_like_html?(l)}.map {|l| l.split('#')[0]}.uniq
- links << m.url unless (links.include?(m.url) || links.include?("#{m.url}/"))
# 0.7.4: start the list with m.url, then filter on m.root_url instead of m.url
# (presumably the site root rather than the inspected page — confirm against
# MetaInspector). Each candidate is cut at the first '#' and exact duplicates
# are dropped before the trailing-slash check below.
+ links = [m.url]
+
+ m.links.select {|l| l.start_with?(m.root_url) && looks_like_html?(l)}.map {|l| l.split('#')[0]}.uniq.each do |link|
# A link and the same link with or without a trailing "/" count as one entry,
# so both spellings are looked up before appending.
+ if link[-1,1] == "/"
# Ends with a slash: compare against the link with its last character removed.
+ links << link unless (links.include?(link) || links.include?(link.chop))
+ else
# No trailing slash: compare against the link with a slash appended.
+ links << link unless (links.include?(link) || links.include?("#{link}/"))
+ end
+ end
+
# Wrap every collected link in a page object.
pages = links.map {|link| W3Clove::Page.new(link)}
end
pages
end
\ No newline at end of file