bin/anemone_cron.rb in anemone-0.0.2 vs bin/anemone_cron.rb in anemone-0.0.3
- old
+ new
@@ -1,99 +1,113 @@
-#! /usr/bin/env ruby
-# == Synopsis
-# Performs pagedepth, url list, and count functionality
-# Meant to be run daily as a cron job
-#
-# == Usage
-# anemone_cron.rb [options] url
-#
-# == Options
-# -r, --relative         Output relative URLs (rather than absolute)
-# -o, --output filename  Filename to save URL list to. Defaults to urls.txt.
-#
-# == Author
-# Chris Kite
-
-$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
-
-require 'anemone'
-require 'optparse'
-require 'rdoc/usage'
-require 'ostruct'
-
-options = OpenStruct.new
-options.relative = false
-options.output_file = 'urls.txt'
-
-# make sure that the last option is a URL we can crawl
-begin
-  URI(ARGV.last)
-rescue
-  RDoc::usage()
-  Process.exit
-end
-
-# parse command-line options
-opts = OptionParser.new
-opts.on('-r', '--relative') { options.relative = true }
-opts.on('-o', '--output filename') {|o| options.output_file = o }
-opts.parse!(ARGV)
-
-root = ARGV.last
-
-Anemone.crawl(root) do |anemone|
-
-  anemone.after_crawl do |pages|
-    puts "Crawl results for #{root}\n"
-
-    # print a list of 404's
-    not_found = []
-    pages.each_value do |page|
-      url = page.url.to_s
-      not_found << url if page.not_found?
-    end
-    if !not_found.empty?
-      puts "\n404's:"
-      not_found.each do |url|
-        if options.relative
-          puts URI(url).path.to_s
-        else
-          puts url
-        end
-        num_linked_from = 0
-        pages.urls_linking_to(url).each do |u|
-          u = u.path if options.relative
-          num_linked_from += 1
-          puts "  linked from #{u}"
-          if num_linked_from > 10
-            puts "  ..."
-            break
-          end
-        end
-      end
-
-      print "\n"
-    end
-
-    # remove redirect aliases, and calculate pagedepths
-    pages = pages.shortest_paths!(root).uniq
-    depths = pages.values.inject({}) do |depths, page|
-      depths[page.depth] ||= 0
-      depths[page.depth] += 1
-      depths
-    end
-
-    # print the page count
-    puts "Total pages: #{pages.size}\n"
-
-    # print a list of depths
-    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
-
-    # output a list of urls to file
-    file = open(options.output_file, 'w')
-    pages.each_key do |url|
-      url = options.relative ? url.path.to_s : url.to_s
-      file.puts url
-    end
-
-  end
+#! /usr/bin/env ruby
+# == Synopsis
+# Performs pagedepth, url list, and count functionality
+# Meant to be run daily as a cron job
+#
+# == Usage
+# anemone_cron.rb [options] url
+#
+# == Options
+# -r, --relative         Output relative URLs (rather than absolute)
+# -o, --output filename  Filename to save URL list to. Defaults to urls.txt.
+#
+# == Author
+# Chris Kite
+
+$:.unshift File.join(File.dirname(__FILE__), "..", "lib")
+
+require 'anemone'
+require 'optparse'
+require 'ostruct'
+
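+# print a usage summary and option descriptions on the console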
+def usage
+  puts <<END
+Usage: anemone_cron.rb [options] url
+
+Options:
+  -r, --relative         Output relative URLs (rather than absolute)
+  -o, --output filename  Filename to save URL list to. Defaults to urls.txt.
+END
+end
+
+options = OpenStruct.new
+options.relative = false
+options.output_file = 'urls.txt'
+
+# make sure that the last option is a URL we can crawl
+begin
+  URI(ARGV.last)
+rescue
+  usage
+  Process.exit
+end
+
+# parse command-line options
+opts = OptionParser.new
+opts.on('-r', '--relative') { options.relative = true }
+opts.on('-o', '--output filename') {|o| options.output_file = o }
+opts.parse!(ARGV)
+
+root = ARGV.last
+
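+# crawl the site, discarding each page's raw body after processing to reduce memory usage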
+Anemone.crawl(root, {:discard_page_bodies => true}) do |anemone|
+
+  anemone.after_crawl do |pages|
+    puts "Crawl results for #{root}\n"
+
+    # print a list of 404's
+    not_found = []
+    pages.each_value do |page|
+      url = page.url.to_s
+      not_found << url if page.not_found?
+    end
+    if !not_found.empty?
+      puts "\n404's:"
+      not_found.each do |url|
+        if options.relative
+          puts URI(url).path.to_s
+        else
+          puts url
+        end
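+        # show the pages linking to this 404, truncating long lists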
+        num_linked_from = 0
+        pages.urls_linking_to(url).each do |u|
+          u = u.path if options.relative
+          num_linked_from += 1
+          puts "  linked from #{u}"
+          if num_linked_from > 10
+            puts "  ..."
+            break
+          end
+        end
+      end
+
+      print "\n"
+    end
+
+    # remove redirect aliases, and calculate pagedepths
+    pages = pages.shortest_paths!(root).uniq
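+    # tally how many pages sit at each link depth from the root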
+    depths = pages.values.inject({}) do |depths, page|
+      depths[page.depth] ||= 0
+      depths[page.depth] += 1
+      depths
+    end
+
+    # print the page count
+    puts "Total pages: #{pages.size}\n"
+
+    # print a list of depths
+    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
+
+    # output a list of urls to file
+    file = open(options.output_file, 'w')
+    pages.each_key do |url|
+      url = options.relative ? url.path.to_s : url.to_s
+      file.puts url
+    end
+    file.close
+
+  end
 end
\ No newline at end of file