require 'imw/utils'
require 'imw/dataset/workflow'
require 'imw/dataset/paths'
module IMW
# The IMW::Dataset class is useful for organizing a complex data
# transformation because it is capable of managing a collection of
# paths and the interdependencies between subparts of the
# transformation.
#
# == Manipulating Paths
#
# Storing paths makes code shorter and more readable. By default
# (this assumes the executing script is in a file
# /home/imw_user/data/foo.rb):
#
# dataset = IMW::Dataset.new
# dataset.path_to(:self)
# #=> '/home/imw_user/data'
# dataset.path_to(:ripd)
# #=> '/home/imw_user/data/ripd'
# dataset.path_to(:pkgd, 'final.tar.gz')
# #=> '/home/imw_user/data/pkgd/final.tar.gz'
#
# Paths can be added
#
# dataset.add_path(:sorted_output, :mungd, 'sorted-file-3923.txt')
# dataset.path_to(:sorted_output)
# #=> '/home/imw_user/data/mungd/sorted-file-3923.txt'
#
# as well as removed (via +remove_path+).
#
# == Defining Workflows
#
# IMW encourages you to think of transforming data as a network of
# interdependent steps (see IMW::Workflow). Each of IMW's five
# default steps maps to a named directory remembered by each
# dataset.
#
# The following example shows why this is a useful abstraction as
# well as illustrating some of the other functionality in IMW.
#
# == Example Dataset
#
# The first step is to import IMW and create the dataset
#
# require 'rubygems'
# require 'imw'
# dataset = IMW::Dataset.new
#
# You can pass in a handle (the name or "slug" for the dataset) via
# the :handle option, along with any other options. Now define the
# steps you intend to take to complete the transformation:
#
# rip::
# Data is collected from a source (+http+, +ftp+, database, &c.)
# and deposited in the :ripd directory of this dataset.
#
# dataset.task :rip do
# IMW.open('http://econ.chimpu.edu/datasets/produce_prices.tar.bz2').cp_to_dir(dataset.path_to(:ripd))
# #=> [ripd]/http/_edu/chimpu_econ/datasets/produce_prices.tar.bz2
#
# IMW::Rip.from_database :named => "weather_records",
# :at => "public.astro.chimpu.edu",
# :select => "* FROM hurricane_frequency"
# #=> [ripd]/sql/_edu/chimpu_astro_public/weather_records/select_from_hurricane_frequency-2009-02-16--15:30:26.tsv
# end
#
# Where [ripd] would be replaced by the IMW
# :ripd directory. The default :rip task is
# empty, so if there's no need to rip data (perhaps it's already on
# disk?) then nothing needs to be done here.
#
# raw::
# Managed by the :raw task, data is uncompressed and
# extracted (if necessary) and stored in a subdirectory of the
# :data directory named by the taxon and handle of this
# dataset.
#
# dataset.task :raw do
# IMW::Raw.uncompress_and_extract File.join(dataset.path_to(:ripd),'http/_edu/chimpu_econ/datasets'),
# Dir[File.join(dataset.path_to(:ripd),'sql/_edu/chimpu_astro_public/**/*.tsv')].first
# #=> [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/001.xml
# [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/002.xml
# [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/003.xml
# ...
# [data]/economics/alarming_trends/recent_history_of_banana_prices/rawd/select_from_hurricane_frequency-2009-02-16--15:30:26.tsv
# end
#
# Where [data] would be replaced by the IMW
# :data directory.
#
# If this dataset didn't have a taxon
# (economics/alarming_trends) its files would be stored in a
# directory +recent_history_of_banana_prices+ just below the
# :data directory.
#
# fix::
# Managed by the :fix task, transformations on the data
# are performed. IMW's method is to read data from a source
# format (XML, YAML, CSV, &c.) into Ruby objects with hash
# semantics. These objects might be based upon structs,
# ActiveRecord, DataMapper::Resource, FasterCSV...anything which
# can be accessed as thing.property (FIXME 'and' or 'or'
# ) thing[:property]: the Infinite Monkeywrench fits
# neatly into your toolbox.
#
#
# # Open an output file in CSV for writing
# output = IMW.open! File.join(dataset.path_to(:fixd), 'date_bananas_hurricanes.csv')
# #=> FasterCSV at [data]/economics/alarming_trends/recent_history_of_banana_prices/fixd/date_bananas_hurricanes.csv
#
# # A place to store the combined data
# correlations = []
#
# dataset.task :fix do
#
# # Return the contents of the weather data which has rows like
# #
# # 1 2008-09-01 4
# # 2 2008-09-08 3
# # 3 2008-09-15 3
# # ...
# #
# weather_data = IMW.open(Dir[File.join(dataset.path_to(:rawd), '*.tsv')].first,
# :headers => ["ID","DATE","NUM_HURRICANES"]).entries
# #=> [#, ... ]
#
#
# # Return the matching data from the produce prices XML file which looks like
# #
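# # <prices>
# #   <price type="banana">
# #     <date>2008/09/01</date>
# #     <amount>0.15</amount>
# #   </price>
# #   <price type="apple">
# #     <date>2008/09/01</date>
# #     <amount>0.20</amount>
# #   </price>
# #   ...
# # </prices>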
# parser = IMW::XMLParser.new :records => [ 'prices/price[@type="banana"]',
# { :week => 'date',
# :price => 'amount' }]
#
# # Loop through the XML produce prices, mixing in the hurricane data,
# # and outputting new rows.
# Dir["#{dataset.path_to :rawd}/*.xml"].each do |file|
# IMW.open file do |xml| #=> Hpricot::Doc
# parser.parse(xml).each do |record|
# num_hurricanes = weather_data.detect(lambda { nil }) {|id,week,num_hurricanes| week == record.week}
# output << [record[:week],record[:price],num_hurricanes]
# end
# end
# end
# end
#
# package::
# Data is packaged and compressed (if necessary) into a delivery
# format and deposited into the :pkgd directory.
#
# dataset.task :pkg do
# IMW.open(File.join(dataset.path_to(:fixd), 'date_bananas_hurricanes.csv')).compress!
# #=> [data]/economics/alarming_trends/recent_history_of_banana_prices/pkgd/date_bananas_hurricanes.csv.bz2
# end
#
# In the above, dataset.task behaves like
# Rake.task, merely defining a task and its dependencies
# without executing it. To execute a task, invoke it:
#
# dataset.task(:pkg).invoke
#
# Since the :rip, :raw, :fix, and
# :pkg tasks depend upon each other, invoking :pkg
# will first cause :rip to run.
#
# By default, the tasks associated with a dataset are blank. All of
# IMW's functionality is available without defining tasks. Tasks
# simply provide a convenient scaffold for building a data
# transformation upon.
#
# Similarly, there is no requirement to use the directory structure
# outlined above. IMW's methods accept plain filenames and do the
# Right Thing where possible. The combination of tasks with
# matching directory structure is a suggested but not mandatory
# framework in which to program.
class Dataset
  # Mix in IMW::Workflow, which contains the pre-defined tasks for
  # dataset processing.
  include IMW::Workflow

  attr_accessor :handle
  attr_accessor :options
  attr_accessor :data

  # Build a new dataset.
  #
  # +options+ is a hash that is stored as given; its :handle entry
  # becomes the dataset's handle.
  #
  # The setup helpers invoked below are not defined in this class --
  # presumably they are supplied by IMW::Workflow and by the required
  # imw/dataset/paths file (TODO confirm).
  def initialize(options = {})
    @options = options
    @handle  = @options[:handle]

    initialize_workflow
    set_root_paths
    set_paths
    set_tasks
  end
end
end