Sha256: 7a2270934c38bbcba4a73e132e594ed646dc19d52e8638fe0c4c31dc9444f90f

Contents?: true

Size: 1.09 KB

Versions: 14

Compression:

Stored size: 1.09 KB

Contents

#! /usr/bin/env ruby
# == Synopsis
#   Crawls a site starting at the given URL, and saves the resulting
#   PageHash object to a file using Marshal serialization.
#
# == Usage
#   anemone_serialize.rb [options] url
#
# == Options
#   -o, --output filename           Filename to save PageHash to. Defaults to crawl.<Unix timestamp>
#
# == Author
#   Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'optparse'
require 'ostruct'

# Writes the command-line help text (invocation syntax followed by the
# one supported option) to standard output.
def usage
  # Each argument to puts is emitted on its own line; the empty string
  # produces the blank separator line.
  puts 'Usage: anemone_serialize.rb [options] url',
       '',
       'Options:',
       '  -o, --output filename      Filename to save PageHash to. Defaults to crawl.{Time.now}'
end

# Default output filename: "crawl." followed by the current Unix time in seconds.
options = OpenStruct.new
options.output_file = "crawl.#{Time.now.to_i}"

# Parse command-line options first: parse! removes the recognised switches
# (and their arguments) from ARGV, so only positional arguments remain.
opts = OptionParser.new
opts.on('-o', '--output filename') {|o| options.output_file = o }
begin
  opts.parse!(ARGV)
rescue OptionParser::ParseError => e
  # Unknown switch or missing switch argument: report it and show the help.
  $stderr.puts e.message
  usage
  exit 1
end

# Make sure that the first remaining argument is a URL we can crawl.
# This runs after option parsing so that it inspects the URL itself rather
# than a leading "-o" switch, and it rejects a missing URL explicitly.
root = ARGV[0]
begin
  raise ArgumentError, 'no url given' if root.nil?
  URI(root)
rescue
  usage
  exit 1 # non-zero status: the invocation was invalid
end

# Crawl from the root URL and, once the crawl has finished, serialize the
# collected pages to the output file.
Anemone.crawl(root) do |anemone|
  anemone.after_crawl do |pages|
    # File.open rather than Kernel#open, so a filename beginning with "|"
    # is never interpreted as a command to run; binary mode because
    # Marshal produces a binary stream.
    File.open(options.output_file, 'wb') {|f| Marshal.dump(pages, f)}
  end
end

Version data entries

14 entries across 14 versions & 4 rubygems

Version Path
chriskite-anemone-0.0.4 bin/anemone_serialize.rb
chriskite-anemone-0.0.5 bin/anemone_serialize.rb
chriskite-anemone-0.0.6 bin/anemone_serialize.rb
chriskite-anemone-0.1.0 bin/anemone_serialize.rb
jeremyf-anemone-0.1.3 bin/anemone_serialize.rb
parolkar-anemone-0.1.2 bin/anemone_serialize.rb
anemone-0.2.0 bin/anemone_serialize.rb
anemone-0.1.2 bin/anemone_serialize.rb
anemone-0.0.5 bin/anemone_serialize.rb
anemone-0.0.6 bin/anemone_serialize.rb
anemone-0.1.0 bin/anemone_serialize.rb
anemone-0.1.1 bin/anemone_serialize.rb
anemone-0.0.4 bin/anemone_serialize.rb
anemone-0.0.3 bin/anemone_serialize.rb