Sha256: e789ef1057b38b03daef5869921326a26d7a55425c6b3f040fa9cbbc55436d80

Contents?: true

Size: 1.13 KB

Versions: 1

Compression:

Stored size: 1.13 KB

Contents

#! /usr/bin/env ruby
# == Synopsis
#   Crawls a site starting at the given URL, and outputs the URL of each page
#   in the domain as it is encountered.
#
# == Usage
#   anemone_url_list.rb [options] url
#
# == Options
#   -r, --relative          Output relative URLs (rather than absolute)
#
# == Author
#   Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'optparse'
require 'ostruct'

# Prints the command-line help text (usage line and option summary) to
# standard output.
def usage
  help_lines = [
    "Usage: anemone_url_list.rb [options] url",
    "    ", # whitespace-only separator line, kept exactly as emitted before
    "Options:",
    "  -r, --relative      Output relative URLs (rather than absolute)"
  ]
  # puts writes each array element on its own line.
  puts help_lines
end

# Script entry point: parse the options, validate the start URL, then crawl
# and print the URL of every page encountered.

options = OpenStruct.new
options.relative = false

# Parse command-line options first. parse! removes the recognised flags from
# ARGV, so afterwards only positional arguments remain and ARGV.last is
# really the URL (not a trailing flag such as "-r").
opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
begin
  opts.parse!(ARGV)
rescue OptionParser::ParseError => e
  # Unknown or malformed option: report it, show the help text and fail.
  warn e.message
  usage
  exit 1
end

# make sure that the last remaining argument is a URL we can crawl
start_url = ARGV.last
url_ok =
  begin
    # A missing or empty argument is rejected without relying on URI() to
    # raise for it.
    !start_url.nil? && !start_url.empty? && URI(start_url)
  rescue URI::InvalidURIError
    false
  end

unless url_ok
  usage
  exit 1 # non-zero status so callers can detect the usage error
end

Anemone.crawl(start_url) do |anemone|
  anemone.on_every_page do |page|
    # -r/--relative prints only the path component of each page URL.
    puts(options.relative ? page.url.path : page.url)
  end
end

Version data entries

1 entry across 1 version & 1 rubygem

Version Path
anemone-0.0.3 bin/anemone_url_list.rb~