#
# h2. lib/imw/dataset.rb -- imw dataset
#
# == About
#
# Defines basic properties of the IMW::Dataset
#
# Author::    (Philip flip Kromer, Dhruv Bansal) for Infinite Monkeywrench Project (mailto:coders@infochimps.org)
# Copyright:: Copyright (c) 2008 infochimps.org
# License::   GPL 3.0
# Website::   http://infinitemonkeywrench.org/
#

# puts "#{File.basename(__FILE__)}: You use your Monkeywrench to rake deep and straight furrows in the earth for your orchard." # at bottom

require 'rake'
require 'ostruct'

require 'imw/utils'
require 'imw/dataset/workflow'
require 'imw/dataset/loaddump'
require 'imw/dataset/stats'

module IMW

  # The basic unit in IMW is the dataset.  Each dataset has a handle
  # which is meant to be unique (at least in the context of a
  # particular pool of datasets, see IMW::Pool).  A dataset
  # can also have a taxonomic classification or _taxon_
  #
  #   dataset = IMW::Dataset.new :recent_history_of_banana_prices,
  #                              :taxon => [:economics,:alarming_trends]
  #
  # but it isn't required like the handle.
  #
  # Processing a dataset commonly occurs in four coarse steps.  IMW
  # defines a task[http://rake.rubyforge.org] for each of these steps
  # and keeps files involved in different steps in different
  # directories.
  #
  # rip::
  #   Managed by the <tt>:rip</tt> task, data is collected from a
  #   source (+http+, +ftp+, database, &c.) and deposited in a
  #   subdirectory of the <tt>:ripd</tt> directory named for the URI
  #   of the source.
  #
  #     dataset.task :rip do
  #       IMW::Rip.from_web 'http://econ.chimpu.edu/datasets/produce_prices.tar.bz2'
  #       #=> [ripd]/http/econ_chimpu_edu/datasets/produce_prices.tar.bz2
  #
  #       IMW::Rip.from_database :named => "weather_records",
  #                              :at => "public.astro.chimpu.edu",
  #                              :select => "* FROM hurricane_frequency"
  #       #=> [ripd]/sql/_edu/chimpu_astro_public/weather_records/select_from_hurricane_frequency-2009-02-16--15:30:26.tsv
  #     end
  #
  #   Where <tt>[ripd]</tt> would be replaced by the IMW
  #   <tt>:ripd</tt> directory.  The default <tt>:rip</tt> task is
  #   empty so if there's no need to rip data (perhaps it's already on
  #   disk?) then nothing needs to be done here.
  #
  # raw::
  #   Managed by the <tt>:raw</tt> task, data is uncompressed and
  #   extracted (if necessary) and stored in a subdirectory of the
  #   <tt>:data</tt> directory named by the taxon and handle of this
  #   dataset.
  #
  #     dataset.task :raw do
  #       IMW::Raw.uncompress_and_extract File.join(dataset.path_to(:ripd),'http/_edu/chimpu_econ/datasets'),
  #                                       Dir[File.join(dataset.path_to(:ripd),'sql/_edu/chimpu_astro_public/**/*.tsv')].first
  #       #=> [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/001.xml
  #           [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/002.xml
  #           [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/003.xml
  #           ...
  #           [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/select_from_hurricane_frequency-2009-02-16--15:30:26.tsv
  #     end
  #
  #   Where <tt>[data]</tt> would be replaced by the IMW
  #   <tt>:data</tt> directory.
  #
  #   If this dataset didn't have a taxon
  #   (<tt>economics/alarming_trends</tt>) its files would be stored in a
  #   directory +recent_history_of_banana_prices+ just below the
  #   <tt>:data</tt> directory.
  #
  # fix::
  #   Managed by the <tt>:fix</tt> task, transformations on the data
  #   are performed.  IMW's method is to read data from a source
  #   format (XML, YAML, CSV, &c.) into Ruby objects with hash
  #   semantics.  These objects might be based upon structs,
  #   ActiveRecord, DataMapper::Resource, FasterCSV...anything which
  #   can be accessed as <tt>thing.property</tt> (FIXME 'and' or 'or'
  #   ) <tt>thing[:property]</tt>: the Infinite Monkeywrench fits
  #   neatly into your toolbox.
  #
  #     # Open an output file in CSV for writing
  #     output = IMW.open! File.join(dataset.path_to(:fixd), 'date_bananas_hurricanes.csv')
  #     #=> FasterCSV at [data]/economics/alarming_trends/recent_history_of_banana_prices/fixd/date_bananas_hurricanes.csv
  #
  #     # A place to store the combined data
  #     correlations = []
  #
  #     dataset.task :fix do
  #
  #       # Return the contents of the weather data which has rows like
  #       #
  #       #   1  2008-09-01  4
  #       #   2  2008-09-08  3
  #       #   3  2008-08-15  3
  #       #   ...
  #       #
  #       weather_data = IMW.open(Dir[File.join(dataset.path_to(:rawd), '*.tsv')].first,
  #                               :headers => ["ID","DATE","NUM_HURRICANES"]).entries
  #       #=> [<row>, <row>, ... ]
  #
  #
  #       # Return the matching data from the produce prices XML file which looks like
  #       # (illustrative layout, matching the parser specification below)
  #       #
  #       #   <prices>
  #       #     <price type="apple">
  #       #       <date>2008/09/01</date>
  #       #       <amount>0.15</amount>
  #       #     </price>
  #       #     <price type="banana">
  #       #       <date>2008/09/01</date>
  #       #       <amount>0.20</amount>
  #       #     </price>
  #       #     ...
  #       #   </prices>
  #       #
  #       parser = IMW::XMLParser.new :records => [ 'prices/price[@type="banana"]',
  #                                                 { :week  => 'date',
  #                                                   :price => 'amount' }]
  #
  #       # Loop through the XML produce prices, mixing in the hurricane data,
  #       # and outputting new rows.
  #       Dir[File.join(dataset.path_to(:rawd), '*.xml')].each do |file|
  #         IMW.open file do |xml| #=> Hpricot::Doc
  #           parser.parse(xml).each do |record|
  #             _id, _week, num_hurricanes = weather_data.find(lambda { nil }) { |id, week, count| week == record.week }
  #             output << [record.week, record[:price], num_hurricanes]
  #           end
  #         end
  #       end
  #     end
  #
  # package::
  #   Managed by the <tt>:pkg</tt> task, data is packaged and
  #   compressed (if necessary) into a delivery format and deposited
  #   into the <tt>:pkgd</tt> directory.
  #
  #     dataset.task :pkg do
  #       IMW.open(File.join(dataset.path_to(:fixd), 'date_bananas_hurricanes.csv')).compress!
  #       #=> [data]/economics/alarming_trends/recent_history_of_banana_prices/pkgd/date_bananas_hurricanes.csv.bz2
  #     end
  #
  # In the above, <tt>dataset.task</tt> behaves like
  # <tt>Rake.task</tt>, merely defining a task and its dependencies
  # without executing it.  A task is executed by invoking it:
  #
  #   dataset.task(:pkg).invoke
  #
  # Since the <tt>:rip</tt>, <tt>:raw</tt>, <tt>:fix</tt>, and
  # <tt>:pkg</tt> tasks depend upon each other, invoking <tt>:pkg</tt>
  # will first cause <tt>:rip</tt> to run.
  #
  # By default, the tasks associated with a dataset are blank.  All of
  # IMW's functionality is available without defining tasks.  Tasks
  # simply provide a convenient scaffold for building a data
  # transformation upon.
  #
  # Similarly, there is no requirement to use the directory structure
  # outlined above.  IMW's methods accept plain filenames and do the
  # Right Thing where possible.  The combination of tasks with
  # matching directory structure is a suggested but not mandatory
  # framework in which to program.
  class Dataset

    # The <tt>Rake::TaskManager</tt> module allows the
    # <tt>IMW::Dataset</tt> class to leverage the functionality of the
    # Rake[http://rake.rubyforge.org/] library to manage tasks
    # associated with the processing of this dataset.
    include Rake::TaskManager

    # The <tt>IMW::Workflow</tt> module contains pre-defined tasks for
    # dataset processing.
    include IMW::Workflow

    # +handle+::  the identifier given at construction (see #handle=).
    # +taxon+::   the value of the <tt>:taxon</tt> option given at construction.
    # +options+:: an OpenStruct built from +DEFAULT_OPTIONS+.
    attr_reader :handle, :taxon, :options

    # Free-form slot for data attached to this dataset; it is not
    # assigned anywhere in this file.
    attr_accessor :data

    # The default taxon assigned to a dataset.
    DEFAULT_TAXON = nil

    # Default options passed to <tt>Rake</tt>.  Any class including
    # the <tt>Rake::TaskManager</tt> module must define a constant by
    # this name.
    DEFAULT_OPTIONS = {
      :dry_run => false,
      :trace   => false,
      :verbose => false
    }

    # Create a new dataset.
    #
    # +handle+ identifies the dataset.  It is stored through #handle=,
    # so a String handle is normalized exactly as it would be if it
    # were assigned after construction.
    #
    # Arguments include
    #
    # <tt>:taxon</tt> (+DEFAULT_TAXON+):: a string or sequence
    # giving the taxonomic classification of the dataset.  See
    # IMW::Dataset.taxon= for more details on how this
    # argument is interpreted.
    def initialize(handle, options = {})
      # reverse_merge is not core Ruby; presumably it is made available
      # by one of the requires above -- TODO confirm.
      options = options.reverse_merge :taxon => DEFAULT_TAXON

      # Go through the writer rather than assigning @handle directly so
      # that the constructor and #handle= agree on normalization.
      self.handle = handle
      @taxon      = options[:taxon]

      # for rake
      @tasks            = {}
      @rules            = []
      @scope            = []
      @last_description = nil
      @options          = OpenStruct.new(DEFAULT_OPTIONS)

      # Not defined in this file; presumably supplied by IMW::Workflow.
      create_default_tasks

      # sets an empty @paths hash; see utils/paths.rb
      set_paths
    end

    # Assign this dataset's handle.
    #
    # A String is converted with +to_handle+ (defined outside this
    # file); any other object is stored unchanged.
    def handle=(thing)
      @handle = thing.is_a?(String) ? thing.to_handle : thing
    end

  end
end