require 'imw/dataset/workflow'
require 'imw/dataset/paths'
module IMW
# The IMW::Dataset represents a common object in which paths, data
# resources, and various tasks can be intermingled to define a
# complex transformation of data.
#
# == Organizing Paths
#
# IMW encourages you to work within the following directory
# structure for a dataset +my_dataset+:
#
# my_dataset/
# |-- my_dataset.rb
# |-- ripd
# | `-- ...
# |-- rawd
# | `-- ...
# |-- fixd
# | `-- ...
# `-- pkgd
# `-- ...
#
# Just like IMW itself, a dataset can manage a collection of paths.
# If my_dataset.rb defines a dataset:
#
# # my_dataset/my_dataset.rb
# dataset = IMW::Dataset.new(:my_dataset)
#
# then the following paths will be defined:
#
# dataset.path_to(:root) #=> my_dataset
# dataset.path_to(:script) #=> my_dataset/my_dataset.rb
# dataset.path_to(:ripd) #=> my_dataset/ripd
# dataset.path_to(:rawd) #=> my_dataset/rawd
# dataset.path_to(:fixd) #=> my_dataset/fixd
# dataset.path_to(:pkgd) #=> my_dataset/pkgd
#
# Just like IMW itself, the +dataset+ supports adding path
# references
#
# dataset.add_path(:raw_data, :ripd, 'raw_data.xml')
# dataset.path_to(:raw_data) #=> my_dataset/ripd/raw_data.xml
#
# as well as removing them (via dataset.remove_path).
#
# A subclass of IMW::Dataset can customize these paths by overriding
# IMW::Dataset#set_default_paths as well as define new ones by
# overriding IMW::Dataset#set_paths.
#
# Setting paths can be skipped altogether by passing the
# :skip_paths option when instantiating a dataset:
#
# dataset = IMW::Dataset.new :my_dataset, :skip_paths => true
#
# == Utilizing Tasks
#
# An IMW::Dataset utilizes Rake to manage tasks needed to transform
# data. See IMW::Workflow for a description of the pre-defined
# tasks (+rip+, +parse+, +fix+, +package+).
#
# New tasks can be defined
#
# dataset.task :get_authorization do
# # ... get an authorization token
# end
#
# and hooked into the default tasks in the usual Rake manner
#
# dataset.task :rip => [:get_authorization]
#
# A dataset also has methods for the workflow step tasks to make
# this easier
#
# dataset.rip [:get_authorization]
#
# Tasks for a dataset can be accessed and invoked as follows
#
# dataset[:rip].invoke
#
# as well as by using the command line +imw+ tool.
#
# Defining tasks can be skipped altogether by passing the
# :skip_workflow option when instantiating a dataset
#
# dataset = IMW::Dataset.new :my_dataset, :skip_workflow => true
#
# == Working with Repositories
#
# A dataset can be added to a repository by passing the
# :repository option
#
# repo = IMW::Repository.new
# dataset = IMW::Dataset.new :my_dataset, :repository => repo
class Dataset
  # Identifier of this dataset.  It is also the key under which the
  # dataset is stored in a repository given via the :repository
  # option.
  attr_accessor :handle

  # The options hash this dataset was instantiated with.
  attr_accessor :options

  # Builds a dataset identified by +handle+.
  #
  # Keys of +options+ read here:
  #
  # :skip_paths::    when truthy, neither set_default_paths nor
  #                  set_paths is called.
  # :skip_workflow:: when truthy, initialize_workflow is not called.
  # :repository::    when present, the new dataset is stored in it
  #                  under +handle+ (via <tt>[]=</tt>).
  #
  # NOTE(review): set_default_paths, set_paths and initialize_workflow
  # are not defined in this file; presumably they come from the
  # required files / included modules -- confirm.
  def initialize(handle, options = {})
    @handle  = handle
    @options = options

    # Default paths first, then custom ones.  The option is looked up
    # again before each hook rather than cached.
    [:set_default_paths, :set_paths].each do |path_hook|
      send(path_hook) unless options[:skip_paths]
    end

    initialize_workflow unless options[:skip_workflow]

    # Register with the repository, if one was supplied.
    repository = options[:repository]
    repository[handle] = self if repository
  end

  # Provides this dataset with a workflow of tasks managed by Rake.
  include IMW::Workflow

  # Provides this dataset with DSL-like methods to construct a schema
  # in an IMW file.
  include IMW::Metadata::DSL
end
end