lib/imw/dataset.rb in imw-0.1.1 vs lib/imw/dataset.rb in imw-0.2.0

- old
+ new

@@ -1,207 +1,115 @@ -require 'imw/utils' require 'imw/dataset/workflow' require 'imw/dataset/paths' module IMW - # The IMW::Dataset class is useful organizing a complex data - # transformation because it is capable of managing a collection of - # paths and the interdependencies between subparts of the - # transformation. + # The IMW::Dataset represents a common object in which paths, data + # resources, and various tasks can be intermingled to define a + # complex transformation of data. # - # == Manipulating Paths + # == Organizing Paths # - # Storing paths makes code shorter and more readable. By default - # (this assumes the executing script is in a file - # /home/imw_user/data/foo.rb): + # IMW encourages you to work within the following directory + # structure for a dataset +my_dataset+: # - # dataset = IMW::Dataset.new - # dataset.path_to(:self) - # #=> '/home/imw_user/data' - # dataset.path_to(:ripd) - # #=> '/home/imw_user/data/ripd' - # dataset.path_to(:pkgd, 'final.tar.gz') - # #=> '/home/imw_user/data/pkgd/final.tar.gz' + # my_dataset/ + # |-- my_dataset.rb + # |-- ripd + # | `-- ... + # |-- rawd + # | `-- ... + # |-- fixd + # | `-- ... + # `-- pkgd + # `-- ... # - # Paths can be added + # Just like IMW itself, a dataset can manage a collection of paths. + # If <tt>my_dataset.rb</tt> defines a dataset: # - # dataset.add_path(:sorted_output, :mungd, 'sorted-file-3923.txt') - # dataset.path_to(:sorted_output) - # #=> '/home/imw_user/data/mungd/sorted-file-3923.txt' + # # my_dataset/my_dataset.rb + # dataset = IMW::Dataset.new(:my_dataset) # - # as well as removed (via +remove_path+). + # then the following paths will be defined: # - # == Defining Workflows + # dataset.path_to(:root) #=> my_dataset + # dataset.path_to(:script) #=> my_dataset/my_dataset.rb + # dataset.path_to(:ripd) #=> my_dataset/ripd + # dataset.path_to(:rawd) #=> my_dataset/rawd + # dataset.path_to(:fixd) #=> my_dataset/fixd + # dataset.path_to(:pkgd) #=> my_dataset/pkgd # - # IMW encourages you to think of transforming data as a network of - # interdependent steps (see IMW::Workflow). Each of IMW's five - # default steps maps to a named directory remembered by each - # dataset. + # Just like IMW itself, the +dataset+ supports adding path + # references # - # The following example shows why this is a useful abstraction as - # well as illustrating some of the other functionality in IMW. + # dataset.add_path(:raw_data, :ripd, 'raw_data.xml') + # dataset.path_to(:raw_data) #=> my_dataset/ripd/raw_data.xml # - # == Example Dataset + # as well as removed (via <tt>dataset.remove_path</tt>)). # - # The first step is to import IMW and create the dataset + # A subclass of IMW::Dataset can customize these paths be overriding + # IMW::Dataset#set_default_paths as well as define new ones by + # overriding IMW::Dataset#set_paths. # - # require 'rubygems' - # require 'imw' - # dataset = IMW::Dataset.new + # Setting paths can be skipped altogether by passing the + # <tt>:skip_paths</tt> option when instantiating a dataset: # - # You can pass in a handle (the name or "slug" for the dataset) as - # well as some options. Now define the steps you intend to take to - # complete the transformation: + # dataset = IMW::Dataset.new :my_dataset, :skip_paths => true # - # rip:: - # Data is collected from a source (+http+, +ftp+, database, &c.) - # and deposited in the <tt>:ripd</tt> directory of this dataset. + # == Utilizing Tasks # - # dataset.task :rip do - # IMW.open('http://econ.chimpu.edu/datasets/produce_prices.tar.bz2').cp_to_dir(dataset.path_to(:ripd)) - # #=> [ripd]/http/econ_chimpu_edu/datasets/produce_prices.tar.bz2 - # - # IMW::Rip.from_database :named => "weather_records", - # :at => "public.astro.chimpu.edu", - # :select => "* FROM hurricane_frequency" - # #=> [ripd]/sql/_edu/chimpu_astro_public/weather_records/select_from_hurricane_frequency-2009-02-16--15:30:26.tsv - # end + # An IMW::Dataset utilizes Rake to manage tasks needed to transform + # data. See IMW::Workflow for a description of the pre-defined + # tasks (+rip+, +parse+, +fix+, +package+). # - # Where <tt>[ripd]</tt> would be replaced by the IMW - # <tt>:ripd</tt> directory. The default <tt>:rip</tt> task is - # empty so If there's no need to rip data (perhaps it's already on - # disk?) then nothing needs to be done here. - # - # raw:: - # Managed by the <tt>:raw</tt> task, data is uncompressed and - # extracted (if necessary) and stored in a subdirectory of the - # <tt>:data</tt> directory named by the taxon and handle of this - # dataset. + # New tasks can be defined # - # dataset.task :raw do - # IMW::Raw.uncompress_and_extract File.join(dataset.path_to(:ripd),'http/_edu/chimpu_econ/datasets'), - # Dir[File.join(dataset.path_to(:ripd),'sql/_edu/chimpu_astro_public/**/*.tsv')].first - # #=> [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/001.xml - # [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/002.xml - # [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/003.xml - # ... - # [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/select_from_hurricane_frequency-2009-02-16--15:30:26.tsv - # end + # dataset.task :get_authorization do + # # ... get an authorization token + # end # - # Where <tt>[data]</tt> would be replaced by the IMW - # <tt>:data</tt> directory. + # and hooked into the default tasks in the usual Rake manner # - # If this dataset didn't have a taxon - # (economics/alarming_trends) its files would be stored in a - # directory +recent_history_of_banana_prices+ just below the - # <tt>:data</tt> directory. + # dataset.task :rip => [:get_authorization] # - # fix:: - # Managed by the <tt>:fix</tt> task, transformations on the data - # are performed. IMW's method is to read data from a source - # format (XML, YAML, CSV, &c.) into Ruby objects with hash - # semantics. These objects might be based upon structs, - # ActiveRecord, DataMapper::Resource, FasterCSV...anything which - # can be accessed as <tt>thing.property</tt> (FIXME 'and' or 'or' - # ) <tt>thing[:property]</tt>: the Infinite Monkeywrench fits - # neatly into your toobox. + # A dataset also has methods for the workflow step tasks to make + # this easier # + # dataset.rip [:get_authorized] # - # # Open an output file in XML for writing - # output = IMW.open! File.join(dataset.path_to(:fixd), 'date_bananas_hurricanes.csv') - # #=> FasterCSV at [fixd]/economics/alarming_trends/recent_history_of_banana_prices/fixd/data_bananas_hurricanes.csv + # Tasks for a dataset can be accessed and invoked as follows # - # # A place to store the combined data - # correlations = [] + # dataset[:rip].invoke # - # dataset.task :fix do + # as well as by using the command line +imw+ tool. # - # # Return the contents of the weather data which has rows like - # # - # # 1 2008-09-01 4 - # # 2 2008-09-08 3 - # # 3 2008-08-15 3 - # # ... - # # - # weather_data = IMW.open(Dir[File.join(dataset.path_to(:rawd), '*.tsv')].first, - # :headers => ["ID","DATE","NUM_HURRICANES"]).entries - # #=> [#<FasterCSV::Row "ID":nil "DATE":Mon Sep 08 04:15:47 -0600 2008,"NUM_HURRICANES":4>, ... ] + # Defining tasks can be skipped altogether by passing the + # <tt>:skip_workflow</tt> option when instantiating a dataset # + # dataset = IMW::Dataset.new :my_dataset, :skip_workflow => true # - # # Return the matching data from the produce prices XML file which looks like - # # - # # <prices> - # # <price type="apple"> - # # <date>2008/09/01</date> - # # <amount>0.15</amount> - # # </price> - # # <price type="banana"> - # # <date>2008/09/01</date> - # # <amount>0.20</amount> - # # </price> - # # ... - # # </prices> - # parser = IMW::XMLParser.new :records => [ 'prices/price[@type="banana"]', - # { :week => 'date', - # :price => 'amount' }] + # == Working with Repositories # - # # Loop through the XML produce prices, mixing in the hurricane data, - # # and outputting new rows. - # Dir["#{dataset.path_to :rawd}*.xml"] each do |file| - # IMW.open file do |xml| #=> Hpricot::Doc - # parser.parse(xml).each do |record| - # num_hurricanes = weather_data.(lambda { nil }) {|id,week,num_hurricanes| week == record.week} - # output << [week,record[:price],num_hurricanes] - # end - # end - # end - # end + # A dataset can be added to a repository by passing the + # <tt>:repository</tt> option # - # package:: - # Data is packaged and compressed (if necessary) into a delivery - # format and deposited into the <tt>:pkgd</tt> directory. - # - # dataset.task :pkg do - # IMW.open(File.join(dataset.path_to(:fixd), 'date_bananas_hurricanes.csv')).compress! - # #=> [data]/economics/alarming_trends/recent_history_of_banana_prices/pkgd/date_bananas_hurricanes.csv.bz2 - # end - # - # In the above, <tt>dataset.task</tt> behaves like - # <tt>Rake.task</tt>, merely defining a task and its dependencies - # without executing it via - # - # dataset.task(:pkg).invoke - # - # Since the <tt>:rip</tt>, <tt>:raw</tt>, <tt>:fix</tt>, and - # <tt>:pkg</tt> tasks depend upon each other, invoking <tt>:pkg</tt> - # will first cause <tt>:rip</tt> to run. - # - # By default, the tasks associated with a dataset are blank. All of - # IMW's functionality is available without defining tasks. Tasks - # simply provide a convenient scaffold for building a data - # transformation upon. - # - # Similarly, there is no requirement to use the directory structure - # outlined above. IMW's methods accept plain filenames and do the - # Right Thing where possible. The combination of tasks with - # matching directory structure is a suggested but not mandatory - # framework in which to program. + # repo = IMW::Repository.new + # dataset = IMW::Dataset.new :my_dataset, :repository => repo class Dataset - # The <tt>IMW::Workflow</tt> module contains pre-defined tasks for - # dataset processing. include IMW::Workflow - attr_accessor :handle, :options, :data + attr_accessor :handle, :options - def initialize options = {} + def initialize handle, options = {} @options = options - @handle = options[:handle] - initialize_workflow - set_root_paths - set_paths - set_tasks + @handle = handle + set_default_paths unless options[:skip_paths] + set_paths unless options[:skip_paths] + initialize_workflow unless options[:skip_workflow] + if options[:repository] + options[:repository][handle] = self + end end end end