require 'fileutils'
require 'ostruct'
require 'rake'

module IMW

  # An IMW version of Rake::Task
  Task = Class.new(Rake::Task)

  # An IMW subclass of Rake::FileTask
  FileTask = Class.new(Rake::FileTask)

  # An IMW subclass of Rake::FileCreationTask
  FileCreationTask = Class.new(Rake::FileCreationTask)

  # IMW encourages you to view a data transformation as a series of
  # interdependent steps.
  #
  # By default, IMW defines four main steps in such a transformation:
  # +rip+, +parse+, +fix+, and +package+.
  #
  # Each step is associated with a directory on disk in which it keeps
  # its files: +ripd+, +prsd+, +fixd+, and +pkgd+.
  #
  # The steps are:
  #
  # rip::
  #   Obtain data via HTTP, FTP, SCP, RSYNC, database query, &c and
  #   store the results in +ripd+.
  #
  # parse::
  #   Parse data into a structured form using a library (JSON, YAML,
  #   &c.) or using your own parser (XML, flat files, &c.) and store
  #   the results in +prsd+.
  #
  # fix::
  #   Combine, filter, reconcile, and transform already structured
  #   data into a desired form and store the results in +fixd+.
  #
  # package::
  #   Archive, compress, and deliver data in its final form to some
  #   location (HTTP, FTP, SCP, RSYNC, S3, EBS, &c.), optionally
  #   storing the output in +pkgd+.
  #
  # Each step depends upon the one before it.  The steps are blank by
  # default so there's no need to write code for steps you don't need
  # to use.  You can also define your own steps (using +task+ just
  # like in Rake) and hook them into these pre-defined steps (or
  # not...).
  #
  # A dataset also has an <tt>:initialize</tt> task (which by default
  # just creates the directories for these steps) which you can use to
  # hook in your own initialization tasks by making it depend on them.
  #
  # A subclass of IMW::Dataset can customize how tasks are defined by
  # overriding +define_workflow_tasks+, among other methods, and
  # introduce new tasks by overriding +define_tasks+.
  module Workflow

    include Rake::TaskManager

    # Default options passed to Rake.  Any class including the
    # Rake::TaskManager module must define a constant by this name.
    DEFAULT_OPTIONS = {
      :dry_run => false,
      :trace   => false,
      :verbose => false
    }

    # Return a new (or existing) IMW::Task with the given +name+.
    # Dependencies can be declared and a block passed in just as in
    # Rake.
    #
    # @param [Hash, Symbol, String] deps the name of the task (if a
    #   Symbol or String) or the name of the task mapped to an Array of
    #   dependencies (if a Hash)
    # @return [IMW::Task] the task
    def task(deps, &block)
      define_task(IMW::Task, deps, &block)
    end

    # Return a new (or existing) IMW::FileTask with the given +path+.
    # Dependencies can be declared and a block passed in just as in
    # Rake.
    #
    # @param [String, IMW::Resource] path the path to the file; any
    #   object responding to +path+ is replaced by the value of its
    #   +path+ method
    # @return [IMW::FileTask] the task
    def file(path, &block)
      path = path.path if path.respond_to?(:path)
      define_task(IMW::FileTask, path, &block)
    end

    # Return a new (or existing) IMW::FileCreationTask with the given
    # +path+.  Dependencies can be declared and a block passed in just
    # as in Rake.
    #
    # @param [String, IMW::Resource] path the path to the file; any
    #   object responding to +path+ is replaced by the value of its
    #   +path+ method
    # @return [IMW::FileCreationTask] the task
    def file_create(path, &block)
      path = path.path if path.respond_to?(:path)
      define_task(IMW::FileCreationTask, path, &block)
    end

    # Override this method to define default tasks for a subclass of
    # IMW::Dataset.  It is called last by +initialize_workflow+ and
    # does nothing by default.
    def define_tasks
    end

    # The standard IMW workflow steps.
    #
    # @return [Array] the workflow step names
    def workflow_steps
      [:rip, :parse, :fix, :package]
    end

    # The steps of the IMW workflow each correspond to a directory in
    # which it is customary that they deposit their files once they
    # are finished processing (so ripped files wind up in the +ripd+
    # directory, packaged files in the +pkgd+ directory, and so on).
    #
    # NOTE(review): the module overview calls the parse directory
    # +prsd+, but this method returns <tt>:rawd</tt> -- confirm which
    # name +path_to+ expects before changing either.
    #
    # @return [Array] the workflow directory names
    def workflow_dirs
      [:ripd, :rawd, :fixd, :pkgd]
    end

    protected

    # Convenience method for defining tasks for this workflow.
    #
    # @param [Hash, Symbol, String] deps the name of the task (if a
    #   Symbol or String) or the name of the task mapped to an Array of
    #   dependencies (if a Hash)
    # @param [String] comment the comment to associate to the task
    # @return [IMW::Task] the task
    def define_workflow_task(deps, comment, &block)
      # Set the pending description before defining the task, the same
      # slot Rake's own +desc+ fills.
      @last_description = comment
      define_task(IMW::Task, deps, &block)
    end

    # Create all the instance variables required by Rake::TaskManager
    # and define default tasks for this dataset.
    def initialize_workflow
      @tasks            = {}
      @rules            = []
      @scope            = []
      @last_description = nil
      @options          = OpenStruct.new(DEFAULT_OPTIONS)
      define_initialize_task
      define_workflow_tasks
      define_workflow_task_methods
      define_clean_task
      define_tasks
    end

    # Defines the <tt>:initialize</tt> task.  The only other task
    # hooked into <tt>:initialize</tt> is the
    # <tt>:create_directories</tt> task which creates the workflow
    # directories for this dataset.
    #
    # Relies on +path_to+, which is not defined in this module and is
    # presumably supplied by the including class.
    def define_initialize_task
      define_workflow_task({ :create_directories => [] }, "Creates workflow directories for this dataset.") do
        workflow_dirs.each do |dir|
          FileUtils.mkdir_p(path_to(dir)) unless File.exist?(path_to(dir))
        end
      end
      define_workflow_task({ :initialize => [:create_directories] }, "Initialize this dataset.")
    end

    # Creates a task <tt>:clean</tt> which removes this dataset's
    # workflow directories.
    #
    # Relies on +path_to+, which is not defined in this module and is
    # presumably supplied by the including class.
    def define_clean_task
      define_workflow_task :clean, "Remove the workflow directories for this dataset." do
        workflow_dirs.each do |dir|
          FileUtils.rm_rf(path_to(dir)) if File.exist?(path_to(dir))
        end
      end
    end

    # Creates the task dependency chain <tt>:package => :fix =>
    # :parse => :rip => :create_directories</tt> of the IMW::Workflow.
    #
    # NOTE(review): <tt>:rip</tt> depends on
    # <tt>:create_directories</tt> directly, not on
    # <tt>:initialize</tt>, so prerequisites hooked onto
    # <tt>:initialize</tt> are not run by this chain -- confirm this
    # is intended.
    def define_workflow_tasks
      define_workflow_task({ :rip     => [:create_directories] }, "Obtain data from some source."        )
      define_workflow_task({ :parse   => [:rip]                }, "Parse data into a structured form."   )
      define_workflow_task({ :fix     => [:parse]              }, "Munge parsed data into desired form." )
      define_workflow_task({ :package => [:fix]                }, "Package dataset in final form."       )
    end

    # Dynamically define methods for each of the workflow steps which
    # act as shortcuts for accessing the corresponding tasks.
    #
    # Each generated method (+rip+, +parse+, +fix+, +package+) takes
    # optional dependencies and a block and returns the IMW::Task for
    # that step, enhanced with them.
    #
    # NOTE(review): the heredoc body was reconstructed after the
    # original text was lost; only the trailing <tt>deps, &block)</tt>
    # and the +RUBY+ terminator survived.  Verify against the upstream
    # source.  A +nil+ +deps+ is passed on as an empty dependency list.
    def define_workflow_task_methods
      workflow_steps.each do |step|
        # File and line are passed so backtraces from the generated
        # methods point back here.
        self.class.class_eval <<-RUBY, __FILE__, __LINE__ + 1
          def #{step}(deps = nil, &block)
            define_task(IMW::Task, { :#{step} => (deps || []) }, &block)
          end
        RUBY
      end
    end

  end
end