lib/retriever/fetchsitemap.rb in rubyretriever-1.0.3 vs lib/retriever/fetchsitemap.rb in rubyretriever-1.1.0

- old
+ new

@@ -1,36 +1,41 @@ module Retriever - class FetchSitemap < Fetch - def initialize(url,options) #recieves target URL and RR options, returns an array of all unique pages found on the site - super - @data = [@t.target] - page_one = Retriever::Page.new(@t.source,@t) - @linkStack = page_one.parseInternalVisitable - lg("URL Crawled: #{@t.target}") - lg("#{@linkStack.size-1} new links found") - errlog("Bad URL -- #{@t.target}") if !@linkStack + # + class FetchSitemap < Fetch + # recieves target URL and RR options + # returns an array of all unique pages found on the site + def initialize(url, options) + super + @data = [@t.target] + page_one = Retriever::Page.new(@t.source, @t) + lg("URL Crawled: #{@t.target}") + @link_stack = page_one.parse_internal_visitable + errlog("Bad URL -- #{@t.target}") unless @link_stack + lg("#{@link_stack.size - 1} links found") - @linkStack.delete(@t.target) if @linkStack.include?(@t.target) - @linkStack = @linkStack.take(@maxPages) if (@linkStack.size+1 > @maxPages) - @data.concat(@linkStack) + @link_stack.delete(@t.target) + @data.concat(@link_stack) - self.async_crawl_and_collect() + async_crawl_and_collect - @data.sort_by! {|x| x.length} if @data.size>1 - @data.uniq! - end - def gen_xml #produces valid XML sitemap based on page collection fetched. Writes to current directory. - f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+') - f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>" - @data.each do |url| - f << "<url><loc>#{url}</loc></url>" - end - f << "</urlset>" - f.close - puts "###############################" - puts "File Created: sitemap-#{@t.host.split('.')[1]}.xml" - puts "Object Count: #{@data.size}" - puts "###############################" - puts - end - end -end \ No newline at end of file + @data.sort_by! { |x| x.length } if @data.size > 1 + @data.uniq! + end + + # produces valid XML sitemap based on page collection fetched. + # Writes to current directory. + def gen_xml + f = File.open("sitemap-#{@t.host.split('.')[1]}.xml", 'w+') + f << "<?xml version='1.0' encoding='UTF-8'?><urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>" + @data.each do |url| + f << "<url><loc>#{url}</loc></url>" + end + f << '</urlset>' + f.close + puts '###############################' + puts "File Created: sitemap-#{@t.host.split('.')[1]}.xml" + puts "Object Count: #{@data.size}" + puts '###############################' + puts + end + end +end