importer.rb in iron-import-0.7.0

- old
+ new
@@ -16,210 +16,539 @@
 #     importer.process do |row|
 #       puts "Order #{row[:order_number]: #{row[:amount]} on #{row[:date]}"
 #     end
 #   end
 #
-# The row.all? call will verify that each row passed contains a value for all defined columns.
-#
 # A more realistic and complex example follows:
 #
-#   importer = Importer.build do
+#   Importer.build do
+#     # Define our columns and settings
 #     column :order_number do
-#       match /order (num.*|id)/i
+#       header /order (num.*|id)/i
+#       type :int
 #     end
-#     column :date
-#     column :amount
+#     column :date do
+#       type :date
+#     end
+#     column :amount do
+#       type :cents
+#     end
+#  
+#     # Filter out any rows missing an order number
+#     filter do |row|
+#       !row[:order_number].nil?
+#     end
+#
+#   end.import('/path/to/file.csv', format: :csv) do |row|
+#     # Process each row as basically a hash of :column_key => value,
+#     # only called on import success
+#     Order.create(row.to_hash)
+#
+#   end.on_error do
+#     # If we have any errors, do something
+#     raise error_summary
 #   end
 #
 class Importer 
 
-  # Array of error message or nil for each non-header row
-  attr_accessor :errors, :warnings
-  attr_accessor :sheets
-  attr_reader :data, :custom_reader
+  # Inner class for holding load-time data that gets reset on each load call
+  class Data
+    attr_accessor :start_row, :rows
+    def initialize
+      @start_row = nil
+      @rows = []
+    end
+  end
+
+  # Array of defined columns
+  attr_reader :columns
+  # Array of error messages collected during an import/process run
+  attr_accessor :errors
+  # Custom reader, if one has been defined using #on_file or #on_stream
+  attr_reader :custom_reader
+  # Set to the format selected during past import
+  attr_reader :format
+  # Import data
+  attr_reader :data
+  # Missing headers post-import
+  attr_reader :missing_headers
+
+  # When true, skips header detection
+  dsl_flag :headerless
+  # Explicitly sets the row number (1-indexed) where data rows begin,
+  # usually left defaulted to nil to automatically start after the header
+  # row.
+  dsl_accessor :start_row
+  # Set to a block/lambda taking a parsed but unvalidated row as a hash,
+  # return true to keep, false to skip.
+  dsl_accessor :filter
   # Source file/stream encoding, assumes UTF-8 if none specified
   dsl_accessor :encoding
 
+  # Create a new importer!  See #build for details on what to do
+  # in the block.
   def self.build(options = {}, &block)
     importer = Importer.new(options)
     importer.build(&block)
     importer
   end
 
+  # Ye standard constructor!
   def initialize(options = {})
+    @scopes = {}
     @encoding = 'UTF-8'
-    @sheets = {}
+    @headerless = false
+
+    @filter = nil
+    @columns = []
     
     reset
   end
   
-  # Takes a block, and sets self to be importer instance, so you can
-  # just call #column, #sheet, etc. directly.
+  # Call to define the importer's column configuration and other setup options.
+  #
+  # The following builder options are available:
+  # 
+  #   importer = Importer.build do
+  #     # Don't try to look for a header using column definitions, there is no header
+  #     headerless!
+  #
+  #     # Manually set the start row for data, defaults to nil
+  #     # indicating that the data rows start immediatly following the header.
+  #     start_row 4
+  #
+  #     # Define a filter that will skip unneeded rows.  The filter command takes
+  #     # a block that receives the parsed (but not validated!) row data as an 
+  #     # associative hash of :col_key => <parsed value>, and returns 
+  #     # true to keep the row or false to exclude it.
+  #     filter do |row|
+  #       row[:id].to_i > 5000
+  #     end
+  #
+  #     # If you need to process a type of input that isn't built in, define
+  #     # a custom reader with #on_file or #on_stream
+  #     on_file do |path|
+  #       ... read file at path, return array of each row's raw column values ...
+  #     end
+  #
+  #     # Got a multi-block format like Excel or HTML?  You can optionally limit 
+  #     # searching by setting a scope or scopes to search:
+  #     scope :xls, 'Sheet 2'
+  #     # Or set a bunch of scopes in one go:
+  #     scopes :html => ['div > table.data', 'table.aux-data'],
+  #            :xls => [2, 'Orders']
+  #
+  #     # Of course, the main thing you're going to do is to define columns.  See the
+  #     # Column class' notes for options when defining a column.  Note that
+  #     # you can define columns using either hash-style:
+  #     column :id, :type => :integer
+  #     # or builder-style:
+  #     column :name do
+  #       header /company\s*name/i
+  #       type :string
+  #     end
+  #   end
   def build(&block)
     DslProxy.exec(self, &block) if block
     self
   end
   
-  # For the common case where there is only one "sheet", e.g. CSV files.
-  def default_sheet(&block)
-    sheet(1, true, &block)
-  end
-  
-  # Access a Sheet definition by id (either number (1-N) or sheet name).
-  # Used during #build calls to define a sheet with a passed block, like so:
+  # Add a new column definition to our list, allows customizing the new
+  # column with a builder block.  See Importer::Column docs for 
+  # options.  In lieu of a builder mode, you can pass the same values
+  # as key => value pairs in the options hash to this method, so:
   #
-  #   Importer.build do
-  #     sheet(1) do
-  #       column :store_name
-  #       column :store_address
+  #   column(:foo) do
+  #     type :string
+  #     parse do |val|
+  #       val.to_s.upcase
   #     end
-  #     sheet('Orders') do
-  #       column :id
-  #       column :price
-  #       filter do |row|
-  #         row[:price].prensent?
-  #       end
-  #     end
   #   end
-  def sheet(id, create=true, &block)
-    # Find the sheet, creating it if needed (and requested!)
-    if @sheets[id].nil?
-      if create
-        @sheets[id] = Sheet.new(self, id)
-      else
-        return nil
-      end
+  # 
+  # Is equivalent to:
+  #
+  #   column(:foo, :type => :string, :parse => lambda {|val| val.to_s.upcase})
+  #
+  # Use whichever you prefer!
+  def column(key, options_hash = {}, &block)
+    # Find existing column with key to allow re-opening an existing definition
+    col = @columns.detect {|c| c.key == key }
+    unless col
+      # if none found, add a new one
+      col = Column.new(self, key, options_hash)
+      @columns << col
     end
-    sheet = @sheets[id]
     
-    # Allow customization by DSL block if requested
-    sheet.build(&block) if block
+    # Customize if needed
+    DslProxy::exec(col, &block) if block
     
-    # Return the sheet
-    sheet
+    col
   end
+  
+  # Limit the search scope for a single format (:xls, :xlsx, :html, :custom)
+  # to the given value or values - the meaning and format of scopes is determined
+  # by that format's data reader.
+  def scope(format, *scopes)
+    @scopes[format] = scopes.flatten
+  end
+  
+  # Limit the search scope for more than one format at a time.  For example, if
+  # you support both XLS and XLSX formats (and why wouldn't you?) then you
+  # could tell the importer to look only at the sheets named "Orders" and
+  # "Legacy Orders" like so:
+  #
+  #   scopes :xls => ['Orders', 'Legacy Orders'],
+  #          :xlsx => ['Orders', 'Legacy Orders']
+  #
+  def scopes(map = :__read__)
+    if map == :__read__
+      return @scopes
+    else
+      map.each_pair do |format, scope|
+        scope(format, scope)
+      end
+    end
+  end
 
-  # Define a custom file reader to implement your own sheet parsing.
+  # Define a custom file reader to implement your own parsing.  Pass
+  # a block accepting a file path, and returning an array of arrays (rows of
+  # raw column values).  Use #add_error(msg) to add a reading error.
+  # 
+  # Adding a custom stream parser will change the importer's default
+  # format to :custom, though you can override it when calling #import as 
+  # usual.
+  #
+  # Only one of #on_file or #on_stream needs to be implemented - the importer
+  # will cross convert as needed!
+  #
+  # Example:
+  #
+  #   on_file do |path|
+  #     # Read a file line by line
+  #     File.readlines(path).collect do |line|
+  #       # Each line has colon-separated values, so split 'em up
+  #       line.split(/\s*:\s*/)
+  #     end
+  #   end
+  #
   def on_file(&block)
     @custom_reader = CustomReader.new(self) unless @custom_reader
     @custom_reader.set_reader(:file, block)
   end
   
+  # Just like #on_file, but for streams.  Pass
+  # a block accepting a stream, and returning an array of arrays (rows of
+  # raw column values).  Use #add_error(msg) to add a reading error.
+  # 
+  # Example:
+  #
+  #   on_stream do |stream|
+  #     # Stream contains rows separated by a | char
+  #     stream.readlines('|').collect do |line|
+  #       # Each line has 3 fields of 10 characters each
+  #       [line[0...10], line[10...20], line[20...30]]
+  #     end
+  #   end
+  #
   def on_stream(&block)
     @custom_reader = CustomReader.new(self) unless @custom_reader
     @custom_reader.set_reader(:stream, block)
   end
   
-  # Very, very commonly we only want to deal with the default sheet.  In this case,
-  # let folks skip the sheet(n) do ... end block wrapper and just define columns
-  # against the main importer.  Internally, proxy those calls to the first sheet.
-  def column(*args, &block)
-    default_sheet.column(*args, &block)
-  end
-  
-  # Ditto for filters
-  def filter(*args, &block)
-    default_sheet.filter(*args, &block)
-  end
-  
-  # Ditto for start row too
-  def start_row(row_num)
-    default_sheet.start_row(row_num)
-  end
-  
-  # More facading
-  def headerless!
-    default_sheet.headerless!
-  end
-  
   # First call to a freshly #build'd importer, this will read the file/stream/path supplied,
   # validate the required values, run custom validations... basically pre-parse and
   # massage the supplied data.  It will return true on success, or false if one
   # or more errors were encountered and the import failed.
   #
   # You may supply various options for the import using the options hash.  Supported
   # options include:
   #
-  #   format: one of :auto, :csv, :xls, :xlsx, defaults to :auto, forces treating the supplied
-  #           source as the specified format, or auto-detects if set to :auto
+  #   format: one of :auto, :csv, :html, :xls, :xlsx, defaults to :auto, forces treating the supplied
+  #           source as the specified format, or attempts to auto-detect if set to :auto
+  #   scope: specify the search scope for the data/format, overriding any scope set with #scope
   #   encoding: source encoding override, defaults to guessing based on input
   #   
-  # Generally, you should be able to throw a source at it and it should work.  The
+  # Generally, you should be able to throw a path or stream at it and it should work.  The
   # options exist to allow overriding in cases where the automation heuristics
   # have failed and the input type is known by the caller.
   #
+  # If you're trying to import from a raw string, use Importer#import_string instead.
+  #
   # After #import has completed successfully, you can process the resulting data
-  # using #process or extract the raw data by calling #to_hash or #sheet(num).to_a
-  def import(path_or_stream, options = {})
+  # using #process or extract the raw data by calling #to_a to get an array of row hashes
+  #
+  # Note that as of version 0.7.0, there is a more compact operation mode enabled by passing 
+  # a block to this call:
+  #
+  #    importer.import(...) do |row|
+  #      # Process each row here
+  #    end
+  #
+  # In this mode, the block is called with each row as in #process, conditionally on no
+  # errors.  In addition, when a block is passed, true/false is not returned (as the
+  # block is already conditionally called).  Instead, it will return the importer to allow
+  # chaining to #on_error or other calls.
+  def import(path_or_stream, options = {}, &block)
     # Clear all our load-time state, including all rows, header locations... you name it
     reset
     
     # Get the reader for this format
     default = @custom_reader ? :custom : :auto
-    format = options.delete(:format) { default }
-    if format == :custom
+    @format = options.delete(:format) { default }
+    if @format == :custom
       # Custom format selected, use our internal custom reader
-      @data = @custom_reader
+      @reader = @custom_reader
       
-    elsif format && format != :auto
+    elsif @format && @format != :auto
       # Explicit format requested
-      @data = DataReader::for_format(self, format)
-      unless @data
-        add_error("Unable to find format handler for format #{format} - aborting")
-        return
-      end
+      @reader = DataReader::for_format(self, @format)
       
     else
       # Auto select
-      @data = DataReader::for_source(self, path_or_stream)
+      @reader = DataReader::for_source(self, path_or_stream)
+      @format = @reader.format
     end
 
+    # Verify we got one
+    unless @reader
+      add_error("Unable to find format handler for format :#{format} on import of #{path_or_stream.class.name} source - aborting")
+      return
+    end
+    
+    # What scopes (if any) should we limit our searching to?
+    scopes = options.delete(:scope) { @scopes[@format] }
+    if scopes && !scopes.is_a?(Array)
+      scopes = [scopes]
+    end
+
     # Read in the data!
-    @data.load(path_or_stream)
+    @reader.load(path_or_stream, scopes) do |raw_rows|
+      # Find our column layout, start of data, etc
+      if find_header(raw_rows)
+        # Now, run all the data and add it as a Row instance
+        raw_rows.each_with_index do |raw, index|
+          row_num = index + 1
+          if row_num >= @data.start_row
+            add_row(row_num, raw)
+          end
+        end
+        # We've found a workable sheet/table/whatever, stop looking
+        true
+        
+      else
+        # This sheet/table/whatever didn't have the needed header, try
+        # the next one (if any)
+        false
+      end
+    end
+    
+    # If we have any missing headers, note that fact
+    if @missing_headers && @missing_headers.count > 0
+      add_error("Unable to locate required column header for column(s): " + @missing_headers.collect{|c| ":#{c}"}.list_join(', '))
+    end
+    
+    # If we're here with no errors, we rule!
+    success = !has_errors?
+    
+    if block
+      # New way, if block is passed, process it on success
+      process(&block) if success
+      self
+    else
+      # Old way, return result
+      success
+    end
   end
 
-  # Process a specific sheet, or the default sheet if none is provided.  Your
-  # passed block will be handed one Row at a time.
-  def process(sheet_id = nil, &block)
-    s = sheet(sheet_id, false) || default_sheet
-    s.process(&block)
+  # Use this form of import for the common case of having a raw CSV or HTML string.
+  def import_string(string, options = {}, &block)
+    # Get a format here if needed
+    if options[:format].nil?
+      if @custom_reader
+        format = :custom
+      else
+        format = string.include?('<table') && string.include?('</tr>') ? :html : :csv
+      end
+      options[:format] = format
+    end
+    
+    # Do the import, converting the string to a stream
+    import(StringIO.new(string), options, &block)
   end
+
+  # Call with a block accepting a single Importer::Row with contents that
+  # look like :column_key => <parsed value>.  Any filtered rows
+  # will not be present.  If you want to register an error, simply 
+  # raise "some text" and it will be added to the importer's error
+  # list for display to the user, logging, or whatever.
+  def process
+    @data.rows.each do |row|
+      begin
+        yield row
+      rescue Exception => e
+        add_error(row, e.to_s)
+      end
+    end
+  end
   
-  def add_error(context, msg = nil)
-    if context.is_a?(String) && msg.nil?
-      msg = context
-      context = nil
+  def on_error(&block)
+    raise 'Invalid block passed to Importer#on_error: block may accept 0, 1 or 2 arguments' if block.arity > 2
+    
+    if has_errors?
+      case block.arity
+      when 0 then DslProxy.exec(self, &block)
+      when 1 then DslProxy.exec(self, @errors, &block)
+      when 2 then DslProxy.exec(self, @errors, error_summary, &block)
+      end
     end
-    @errors << Error.new(context, msg)
+    
+    self
   end
   
+  # Process the raw values for the first rows in a sheet,
+  # and attempt to build a map of the column layout, and
+  # detect the first row of real data
+  def find_header(raw_rows)
+    if headerless?
+      # Use implicit or explicit column position when told to not look for a header
+      next_index = 0
+      @columns.each do |col|
+        unless col.position.nil?
+          next_index = col.fixed_index
+        end
+        col.data.index = next_index
+        next_index += 1
+      end
+      @data.start_row = @start_row || 1
+      @missing_headers = nil
+      return true
+      
+    else
+      # Match by testing
+      missing = nil
+      raw_rows.each_with_index do |row, i|
+        # Um, have data?
+        next unless row
+        
+        # Set up for this iteration
+        remaining = @columns.dup
+    
+        # Step through this row's raw values, and look for a matching column for all columns
+        row.each_with_index do |val, i|
+          col = remaining.detect {|c| c.match_header?(val.to_s, i) }
+          if col
+            remaining -= [col]
+            col.data.index = i
+          end
+        end
+        
+        if remaining.empty?
+          # Found all columns, have a map, update our start row to be the next line and return!
+          @data.start_row = @start_row || i+2
+          @missing_headers = nil
+          return true
+        else
+          missing = remaining if (missing.nil? || missing.count > remaining.count)
+        end
+      end
+      
+      # If we get here, we're hosed
+      @missing_headers = missing.collect(&:key) if @missing_headers.nil? || @missing_headers.count > missing.count
+      false
+    end
+  end
+
+  # Add a new row to our stash, parsing/filtering/validating as we go!
+  def add_row(line, raw_data)
+    # Gracefully handle custom parsers that return nil for a row's data
+    raw_data ||= []
+    # Add the row
+    row = Row.new(self, line)
+    
+    # Parse out the values
+    values = {}
+    @columns.each do |col|
+      index = col.data.index
+      raw_val = raw_data[index]
+      if col.parse
+        # Use custom parser if this row has one
+        val = col.parse_value(row, raw_val)
+      else
+        # Otherwise use our standard parser
+        val = @reader.parse_value(raw_val, col.type)
+      end
+      values[col.key] = val
+    end
+
+    # Set the values and filter if needed
+    row.set_values(values)
+    return nil if @filter && !@filter.call(row)
+
+    # Row is desired, now validate values
+    @columns.each do |col|
+      val = values[col.key]
+      col.validate_value(row, val)
+    end
+    
+    # We is good
+    @data.rows << row
+    row
+  end
+
+  # When true, one or more errors have been recorded during this import/process
+  # cycle.
   def has_errors?
     @errors.any?
   end
   
-  def add_warning(context, msg)
+  # Add an error to our error list.  Will result in a failed import.
+  def add_error(context, msg = nil)
     if context.is_a?(String) && msg.nil?
       msg = context
       context = nil
     end
-    @warnings << Error.new(context, msg)
+    @errors << Error.new(context, msg)
   end
   
-  def has_warnings?
-    @warnings.any?
-  end
-  
-  # Returns a human-readable summary of the errors present on the importer
+  # Returns a human-readable summary of the errors present on the importer, or
+  # nil if no errors are present
   def error_summary
+    # Simple case
     return nil unless has_errors?
-    @errors.collect(&:summary).list_join(', ')
+
+    # Group by error text - we often get the same error dozens of times
+    list = {}
+    @errors.each do |err|
+      errs = list[err.text] || []
+      errs << err
+      list[err.text] = errs
+    end
+    
+    # Build summary & return
+    list.values.collect do |errs|
+      summary = errs.first.summary
+      if errs.count == 1
+        summary
+      else
+        errs.count.to_s + ' x ' + summary
+      end
+    end.list_join(', ')
   end
+  
+  # After calling #import, you can dump the final values for each row
+  # as an array of hashes.  Useful in debugging!  For general processing,
+  # use #process or the block form of #import instead.
+  def to_a
+    @data.rows.collect(&:values)
+  end
 
   protected
   
   def reset
     @errors = []
-    @warnings = []
-    @sheets.values.each(&:reset)
+    @missing_headers = nil
+    @format = nil
+    @reader = nil
+    @data = Data.new
   end
   
 end