require 'rubygems'
require 'imw/boot'
require 'imw/utils'

# The Infinite Monkeywrench (IMW) is a Ruby library for ripping,
# extracting, parsing, munging, and packaging datasets.  It allows
# you to handle different data formats transparently as well as
# organize transformations of data as a network of dependencies (a la
# Make or Rake).
#
# IMW has a few central concepts: resources, datasets, workflows, and
# repositories.
#
# Resources represent individual data resources like local files,
# websites, databases, &c.  Resources are typically instantiated via
# IMW.open, with IMW doing the work of figuring out what to return
# based on the URI passed in.
#
# Datasets represent collections of related data resources.  An
# IMW::Dataset comes with a pre-defined (but customizable) workflow
# that takes data resources through several steps: rip, parse, munge,
# and package.  The workflow leverages Rake, so the various tasks
# needed to process the data until it is nice and pretty can all be
# linked with dependencies.
#
# Repositories are collections of datasets, and it is on these
# collections that the +imw+ command line tool operates.
module IMW
  # Sub-namespaces are loaded lazily, the first time the constant is
  # referenced.
  autoload :Resource,        'imw/resource'
  autoload :Schemes,         'imw/schemes'
  autoload :Archives,        'imw/archives'
  autoload :CompressedFiles, 'imw/compressed_files'
  autoload :Formats,         'imw/formats'
  autoload :Tools,           'imw/tools'
  autoload :Parsers,         'imw/parsers'
  autoload :Dataset,         'imw/dataset'
  autoload :Repository,      'imw/repository'

  # Open a resource at the given URI.  The resource will automatically
  # be extended by modules which make sense given the URI.
  #
  # See the documentation for IMW::Resource and the various modules
  # within IMW::Resources for more information and options.
  #
  # Passing in an object that is already an IMW::Resource simply
  # returns that same object; +options+ is ignored in that case.
  #
  # @param [String, Addressable::URI, IMW::Resource] obj the URI to open
  # @param [Hash] options passed through unchanged to IMW::Resource.new
  # @return [IMW::Resource] the resulting resource, properly extended for the given URI
  def self.open(obj, options = {})
    if obj.is_a?(IMW::Resource)
      obj
    else
      IMW::Resource.new(obj, options)
    end
  end

  # Works the same way as IMW.open except that the resource is opened
  # for writing: the given +options+ are merged with
  # <tt>:mode => 'w'</tt> (overriding any <tt>:mode</tt> the caller
  # supplied) before being handed to IMW::Resource.new.
  #
  # Unlike IMW.open, this always constructs a new IMW::Resource.
  #
  # @param [String, Addressable::URI] uri the URI to open
  # @param [Hash] options additional options for IMW::Resource.new
  # @return [IMW::Resource] the resulting resource, properly extended for the given URI and opened for writing
  def self.open!(uri, options = {})
    IMW::Resource.new(uri, options.merge(:mode => 'w'))
  end

  # The default repository in which to place datasets.  It is created
  # on first access and the same instance is returned afterwards.
  #
  # See the documentation for IMW::Repository for more information on
  # how datasets and repositories fit together.
  #
  # @return [IMW::Repository] the default IMW repository
  def self.repository
    @@repository ||= IMW::Repository.new
  end

  # Create a dataset and, when a block is given, evaluate that block
  # in the context of the new dataset (via +instance_eval+) so you can
  # define its workflow.
  #
  # NOTE(review): this method does not itself add the dataset to
  # IMW.repository; any registration with the default repository
  # presumably happens inside IMW::Dataset -- confirm.
  #
  #   IMW.dataset :my_dataset do
  #
  #     # Define some paths we're going to use
  #     add_path :raw_data,  :ripd, 'raw_data.csv'
  #     add_path :fixd_data, :fixd, 'fixed_data.csv'
  #
  #     # Copy a file from a website to this dataset's +ripd+ directory.
  #     rip do
  #       IMW.open('http://mysite.com/data_archives/2010/03/03.csv').cp(path_to(:raw_data))
  #     end
  #
  #     # Filter the raw data to those values which match some criterion defined by accept?
  #     munge do
  #       IMW.open(path_to(:raw_data)).map do |row|
  #         row if accept?(row)
  #       end.compact.dump(path_to(:fixd_data))
  #     end
  #
  #     # Compress this new data
  #     package do
  #       IMW.open(path_to(:fixd_data)).compress.mv(path_to(:pkgd))
  #     end
  #   end
  #
  # @param [Symbol, String] handle the handle to identify this dataset with
  # @param [Hash] options a hash of options (see IMW::Dataset)
  # @return [IMW::Dataset] the new dataset
  def self.dataset(handle, options = {}, &block)
    new_dataset = IMW::Dataset.new(handle, options)
    new_dataset.instance_eval(&block) if block
    new_dataset
  end
end