Sha256: afbc824c0088aebbd764e1affe6875ccbe50451f6d95d3fb6f0f6d788f6af34c

Contents?: true

Size: 895 Bytes

Versions: 2

Compression:

Stored size: 895 Bytes

Contents

#! /usr/bin/env ruby
# == Synopsis
#   Crawls a site starting at the given URL, and outputs a count of
#   the number of Pages at each depth in the site.
#
# == Usage
#   anemone_pagedepth.rb url
#
# == Author
#   Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'rdoc/usage'

# Make sure the first command-line argument is something URI() can parse;
# otherwise print the usage text and stop with a failure status.
begin
  URI(ARGV[0])
rescue
  # A missing or unparsable URL is an error, so request a non-zero exit
  # status instead of reporting success to the calling shell.
  # NOTE(review): RDoc::usage is documented to accept an optional integer
  # exit status as its first argument and to exit on its own -- confirm
  # against the rdoc version this script runs with.
  RDoc::usage(1)
  # Fallback in case RDoc::usage returns; keep the failure status.
  Process.exit 1
end

# Crawl starting from the URL given as the first command-line argument and,
# once the crawl has finished, print one "Depth: N Count: M" line per depth.
root = ARGV[0]
Anemone.crawl(root) do |anemone|
  # NOTE(review): presumably links matching these patterns are not followed;
  # confirm against Anemone's skip_links_like.
  anemone.skip_links_like %r{^/c/$}, %r{^/stores/$}

  anemone.after_crawl do |pages|
    # NOTE(review): presumably recomputes each page's depth as its shortest
    # distance from root and drops duplicates -- confirm against Anemone.
    pages = pages.shortest_paths!(root).uniq

    # Tally pages by their depth. The accumulator has its own name so the
    # block parameter no longer shadows the outer `depths` local, and the
    # hash defaults to 0 so no explicit initialisation is needed per key.
    depths = pages.values.inject(Hash.new(0)) do |counts, page|
      counts[page.depth] += 1
      counts
    end

    # Report in ascending depth order.
    depths.sort.each { |depth, count| puts "Depth: #{depth} Count: #{count}" }
  end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
anemone-0.0.1 bin/anemone_pagedepth.rb
anemone-0.0.2 bin/anemone_pagedepth.rb