Sha256: 7fa2da30e3d0038647d915e5ae27ab5d9ff536dfd33fb82a60cb988c3369a954

Contents?: true

Size: 1019 Bytes

Versions: 2

Compression:

Stored size: 1019 Bytes

Contents

#! /usr/bin/env ruby
# == Synopsis
#   Crawls a site starting at the given URL, and saves the resulting
#   PageHash object to a file using Marshal serialization.
#
# == Usage
#   anemone_serialize.rb [options] url
#
# == Options
#   -o, --output filename           Filename to save PageHash to. Defaults to crawl.<Unix timestamp>, i.e. crawl.#{Time.now.to_i}
#
# == Author
#   Chris Kite

$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'anemone'
require 'optparse'
require 'rdoc/usage'
require 'ostruct'

# Crawls the site rooted at the URL given on the command line and writes the
# pages object handed to the after_crawl hook to a file using Marshal.

options = OpenStruct.new
# The default output name embeds the current Unix timestamp (whole seconds).
options.output_file = "crawl.#{Time.now.to_i}"

# Parse command-line options first. parse! removes the recognised switches
# (and their arguments) from ARGV, so whatever is left at index 0 is the URL
# even when the options precede it, as the documented usage
# "anemone_serialize.rb [options] url" allows. Validating ARGV[0] before this
# point would inspect "-o" instead of the URL.
opts = OptionParser.new
opts.on('-o', '--output filename') { |o| options.output_file = o }
opts.parse!(ARGV)

# Make sure that the remaining first argument is a URL we can crawl.
# Any error raised by URI() is treated as an unusable argument: show the
# usage text and stop.
root = ARGV[0]
begin
  URI(root)
rescue StandardError
  RDoc::usage()
  Process.exit
end

Anemone.crawl(root) do |anemone|
  anemone.after_crawl do |pages|
    # File.open instead of Kernel#open: the file name comes from the command
    # line, and Kernel#open would run a name starting with "|" as a command.
    # Binary mode keeps the Marshal byte stream free of newline translation.
    File.open(options.output_file, 'wb') { |f| Marshal.dump(pages, f) }
  end
end

Version data entries

2 entries across 2 versions & 1 rubygems

Version Path
anemone-0.0.1 bin/anemone_serialize.rb
anemone-0.0.2 bin/anemone_serialize.rb