Sha256: 1d824b029e9c2d0c7c995848b7be8466e3a175a6bb7e6bbb990c9a45a89ec9a0
Contents?: true
Size: 1.1 KB
Versions: 1
Compression:
Stored size: 1.1 KB
Contents
#! /usr/bin/env ruby
# == Synopsis
#   Crawls a site starting at the given URL, and outputs the URL of each page
#   in the domain as they are encountered.
#
# == Usage
#   anemone_url_list.rb [options] url
#
# == Options
#   -r, --relative          Output relative URLs (rather than absolute)
#
# == Author
#   Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'optparse'
require 'ostruct'
require 'uri'

# Prints the command-line help text to standard output.
def usage
  puts <<END
Usage: anemone_url_list.rb [options] url

Options:
  -r, --relative          Output relative URLs (rather than absolute)
END
end

# Returns true when +url+ is a non-empty string that URI() can parse,
# false otherwise (including when no URL was given at all).
def parsable_url?(url)
  return false if url.nil? || url.empty?

  URI(url)
  true
rescue URI::InvalidURIError
  false
end

options = OpenStruct.new
options.relative = false

# Parse command-line options first: parse! removes the flags from ARGV, so
# afterwards only positional arguments remain, wherever the flags were typed.
opts = OptionParser.new
opts.on('-r', '--relative') { options.relative = true }
begin
  opts.parse!(ARGV)
rescue OptionParser::ParseError => e
  # Unknown or malformed option: report it and show the help text instead of
  # dying with a stack trace.
  warn e.message
  usage
  exit 1
end

# Make sure that the last remaining argument is a URL we can crawl.
# A non-zero exit status signals the usage error to the calling shell.
unless parsable_url?(ARGV.last)
  usage
  exit 1
end

puts "CODE\tFROM\tTO"

Anemone.crawl(ARGV.last, :discard_page_bodies => true) do |anemone|
  anemone.on_every_page do |page|
    # NOTE(review): the standard library's URI classes do not define #page, so
    # this only works if page.url is a type that provides it -- confirm against
    # the library; `path` may have been intended for the relative form.
    link = options.relative ? page.url.page : page.url
    puts "#{page.code}\t#{page.from_url}\t#{link}"
  end
end
Version data entries
1 entry across 1 version & 1 rubygem
Version | Path |
---|---|
jeremyf-anemone-0.1.3 | bin/anemone_url_list.rb |