lib/iron/import/importer.rb in iron-import-0.6.1 vs lib/iron/import/importer.rb in iron-import-0.7.0

- old
+ new

@@ -16,210 +16,539 @@ # importer.process do |row| # puts "Order #{row[:order_number]: #{row[:amount]} on #{row[:date]}" # end # end # -# The row.all? call will verify that each row passed contains a value for all defined columns. -# # A more realistic and complex example follows: # -# importer = Importer.build do +# Importer.build do +# # Define our columns and settings # column :order_number do -# match /order (num.*|id)/i +# header /order (num.*|id)/i +# type :int # end -# column :date -# column :amount +# column :date do +# type :date +# end +# column :amount do +# type :cents +# end +# +# # Filter out any rows missing an order number +# filter do |row| +# !row[:order_number].nil? +# end +# +# end.import('/path/to/file.csv', format: :csv) do |row| +# # Process each row as basically a hash of :column_key => value, +# # only called on import success +# Order.create(row.to_hash) +# +# end.on_error do +# # If we have any errors, do something +# raise error_summary # end # class Importer - # Array of error message or nil for each non-header row - attr_accessor :errors, :warnings - attr_accessor :sheets - attr_reader :data, :custom_reader + # Inner class for holding load-time data that gets reset on each load call + class Data + attr_accessor :start_row, :rows + def initialize + @start_row = nil + @rows = [] + end + end + + # Array of defined columns + attr_reader :columns + # Array of error messages collected during an import/process run + attr_accessor :errors + # Custom reader, if one has been defined using #on_file or #on_stream + attr_reader :custom_reader + # Set to the format selected during past import + attr_reader :format + # Import data + attr_reader :data + # Missing headers post-import + attr_reader :missing_headers + + # When true, skips header detection + dsl_flag :headerless + # Explicitly sets the row number (1-indexed) where data rows begin, + # usually left defaulted to nil to automatically start after the header + # row. + dsl_accessor :start_row + # Set to a block/lambda taking a parsed but unvalidated row as a hash, + # return true to keep, false to skip. + dsl_accessor :filter # Source file/stream encoding, assumes UTF-8 if none specified dsl_accessor :encoding + # Create a new importer! See #build for details on what to do + # in the block. def self.build(options = {}, &block) importer = Importer.new(options) importer.build(&block) importer end + # Ye standard constructor! def initialize(options = {}) + @scopes = {} @encoding = 'UTF-8' - @sheets = {} + @headerless = false + + @filter = nil + @columns = [] reset end - # Takes a block, and sets self to be importer instance, so you can - # just call #column, #sheet, etc. directly. + # Call to define the importer's column configuration and other setup options. + # + # The following builder options are available: + # + # importer = Importer.build do + # # Don't try to look for a header using column definitions, there is no header + # headerless! + # + # # Manually set the start row for data, defaults to nil + # # indicating that the data rows start immediatly following the header. + # start_row 4 + # + # # Define a filter that will skip unneeded rows. The filter command takes + # # a block that receives the parsed (but not validated!) row data as an + # # associative hash of :col_key => <parsed value>, and returns + # # true to keep the row or false to exclude it. + # filter do |row| + # row[:id].to_i > 5000 + # end + # + # # If you need to process a type of input that isn't built in, define + # # a custom reader with #on_file or #on_stream + # on_file do |path| + # ... read file at path, return array of each row's raw column values ... + # end + # + # # Got a multi-block format like Excel or HTML? You can optionally limit + # # searching by setting a scope or scopes to search: + # scope :xls, 'Sheet 2' + # # Or set a bunch of scopes in one go: + # scopes :html => ['div > table.data', 'table.aux-data'], + # :xls => [2, 'Orders'] + # + # # Of course, the main thing you're going to do is to define columns. See the + # # Column class' notes for options when defining a column. Note that + # # you can define columns using either hash-style: + # column :id, :type => :integer + # # or builder-style: + # column :name do + # header /company\s*name/i + # type :string + # end + # end def build(&block) DslProxy.exec(self, &block) if block self end - # For the common case where there is only one "sheet", e.g. CSV files. - def default_sheet(&block) - sheet(1, true, &block) - end - - # Access a Sheet definition by id (either number (1-N) or sheet name). - # Used during #build calls to define a sheet with a passed block, like so: + # Add a new column definition to our list, allows customizing the new + # column with a builder block. See Importer::Column docs for + # options. In lieu of a builder mode, you can pass the same values + # as key => value pairs in the options hash to this method, so: # - # Importer.build do - # sheet(1) do - # column :store_name - # column :store_address + # column(:foo) do + # type :string + # parse do |val| + # val.to_s.upcase # end - # sheet('Orders') do - # column :id - # column :price - # filter do |row| - # row[:price].prensent? - # end - # end # end - def sheet(id, create=true, &block) - # Find the sheet, creating it if needed (and requested!) - if @sheets[id].nil? - if create - @sheets[id] = Sheet.new(self, id) - else - return nil - end + # + # Is equivalent to: + # + # column(:foo, :type => :string, :parse => lambda {|val| val.to_s.upcase}) + # + # Use whichever you prefer! + def column(key, options_hash = {}, &block) + # Find existing column with key to allow re-opening an existing definition + col = @columns.detect {|c| c.key == key } + unless col + # if none found, add a new one + col = Column.new(self, key, options_hash) + @columns << col end - sheet = @sheets[id] - # Allow customization by DSL block if requested - sheet.build(&block) if block + # Customize if needed + DslProxy::exec(col, &block) if block - # Return the sheet - sheet + col end + + # Limit the search scope for a single format (:xls, :xlsx, :html, :custom) + # to the given value or values - the meaning and format of scopes is determined + # by that format's data reader. + def scope(format, *scopes) + @scopes[format] = scopes.flatten + end + + # Limit the search scope for more than one format at a time. For example, if + # you support both XLS and XLSX formats (and why wouldn't you?) then you + # could tell the importer to look only at the sheets named "Orders" and + # "Legacy Orders" like so: + # + # scopes :xls => ['Orders', 'Legacy Orders'], + # :xlsx => ['Orders', 'Legacy Orders'] + # + def scopes(map = :__read__) + if map == :__read__ + return @scopes + else + map.each_pair do |format, scope| + scope(format, scope) + end + end + end - # Define a custom file reader to implement your own sheet parsing. + # Define a custom file reader to implement your own parsing. Pass + # a block accepting a file path, and returning an array of arrays (rows of + # raw column values). Use #add_error(msg) to add a reading error. + # + # Adding a custom stream parser will change the importer's default + # format to :custom, though you can override it when calling #import as + # usual. + # + # Only one of #on_file or #on_stream needs to be implemented - the importer + # will cross convert as needed! + # + # Example: + # + # on_file do |path| + # # Read a file line by line + # File.readlines(path).collect do |line| + # # Each line has colon-separated values, so split 'em up + # line.split(/\s*:\s*/) + # end + # end + # def on_file(&block) @custom_reader = CustomReader.new(self) unless @custom_reader @custom_reader.set_reader(:file, block) end + # Just like #on_file, but for streams. Pass + # a block accepting a stream, and returning an array of arrays (rows of + # raw column values). Use #add_error(msg) to add a reading error. + # + # Example: + # + # on_stream do |stream| + # # Stream contains rows separated by a | char + # stream.readlines('|').collect do |line| + # # Each line has 3 fields of 10 characters each + # [line[0...10], line[10...20], line[20...30]] + # end + # end + # def on_stream(&block) @custom_reader = CustomReader.new(self) unless @custom_reader @custom_reader.set_reader(:stream, block) end - # Very, very commonly we only want to deal with the default sheet. In this case, - # let folks skip the sheet(n) do ... end block wrapper and just define columns - # against the main importer. Internally, proxy those calls to the first sheet. - def column(*args, &block) - default_sheet.column(*args, &block) - end - - # Ditto for filters - def filter(*args, &block) - default_sheet.filter(*args, &block) - end - - # Ditto for start row too - def start_row(row_num) - default_sheet.start_row(row_num) - end - - # More facading - def headerless! - default_sheet.headerless! - end - # First call to a freshly #build'd importer, this will read the file/stream/path supplied, # validate the required values, run custom validations... basically pre-parse and # massage the supplied data. It will return true on success, or false if one # or more errors were encountered and the import failed. # # You may supply various options for the import using the options hash. Supported # options include: # - # format: one of :auto, :csv, :xls, :xlsx, defaults to :auto, forces treating the supplied - # source as the specified format, or auto-detects if set to :auto + # format: one of :auto, :csv, :html, :xls, :xlsx, defaults to :auto, forces treating the supplied + # source as the specified format, or attempts to auto-detect if set to :auto + # scope: specify the search scope for the data/format, overriding any scope set with #scope # encoding: source encoding override, defaults to guessing based on input # - # Generally, you should be able to throw a source at it and it should work. The + # Generally, you should be able to throw a path or stream at it and it should work. The # options exist to allow overriding in cases where the automation heuristics # have failed and the input type is known by the caller. # + # If you're trying to import from a raw string, use Importer#import_string instead. + # # After #import has completed successfully, you can process the resulting data - # using #process or extract the raw data by calling #to_hash or #sheet(num).to_a - def import(path_or_stream, options = {}) + # using #process or extract the raw data by calling #to_a to get an array of row hashes + # + # Note that as of version 0.7.0, there is a more compact operation mode enabled by passing + # a block to this call: + # + # importer.import(...) do |row| + # # Process each row here + # end + # + # In this mode, the block is called with each row as in #process, conditionally on no + # errors. In addition, when a block is passed, true/false is not returned (as the + # block is already conditionally called). Instead, it will return the importer to allow + # chaining to #on_error or other calls. + def import(path_or_stream, options = {}, &block) # Clear all our load-time state, including all rows, header locations... you name it reset # Get the reader for this format default = @custom_reader ? :custom : :auto - format = options.delete(:format) { default } - if format == :custom + @format = options.delete(:format) { default } + if @format == :custom # Custom format selected, use our internal custom reader - @data = @custom_reader + @reader = @custom_reader - elsif format && format != :auto + elsif @format && @format != :auto # Explicit format requested - @data = DataReader::for_format(self, format) - unless @data - add_error("Unable to find format handler for format #{format} - aborting") - return - end + @reader = DataReader::for_format(self, @format) else # Auto select - @data = DataReader::for_source(self, path_or_stream) + @reader = DataReader::for_source(self, path_or_stream) + @format = @reader.format end + # Verify we got one + unless @reader + add_error("Unable to find format handler for format :#{format} on import of #{path_or_stream.class.name} source - aborting") + return + end + + # What scopes (if any) should we limit our searching to? + scopes = options.delete(:scope) { @scopes[@format] } + if scopes && !scopes.is_a?(Array) + scopes = [scopes] + end + # Read in the data! - @data.load(path_or_stream) + @reader.load(path_or_stream, scopes) do |raw_rows| + # Find our column layout, start of data, etc + if find_header(raw_rows) + # Now, run all the data and add it as a Row instance + raw_rows.each_with_index do |raw, index| + row_num = index + 1 + if row_num >= @data.start_row + add_row(row_num, raw) + end + end + # We've found a workable sheet/table/whatever, stop looking + true + + else + # This sheet/table/whatever didn't have the needed header, try + # the next one (if any) + false + end + end + + # If we have any missing headers, note that fact + if @missing_headers && @missing_headers.count > 0 + add_error("Unable to locate required column header for column(s): " + @missing_headers.collect{|c| ":#{c}"}.list_join(', ')) + end + + # If we're here with no errors, we rule! + success = !has_errors? + + if block + # New way, if block is passed, process it on success + process(&block) if success + self + else + # Old way, return result + success + end end - # Process a specific sheet, or the default sheet if none is provided. Your - # passed block will be handed one Row at a time. - def process(sheet_id = nil, &block) - s = sheet(sheet_id, false) || default_sheet - s.process(&block) + # Use this form of import for the common case of having a raw CSV or HTML string. + def import_string(string, options = {}, &block) + # Get a format here if needed + if options[:format].nil? + if @custom_reader + format = :custom + else + format = string.include?('<table') && string.include?('</tr>') ? :html : :csv + end + options[:format] = format + end + + # Do the import, converting the string to a stream + import(StringIO.new(string), options, &block) end + + # Call with a block accepting a single Importer::Row with contents that + # look like :column_key => <parsed value>. Any filtered rows + # will not be present. If you want to register an error, simply + # raise "some text" and it will be added to the importer's error + # list for display to the user, logging, or whatever. + def process + @data.rows.each do |row| + begin + yield row + rescue Exception => e + add_error(row, e.to_s) + end + end + end - def add_error(context, msg = nil) - if context.is_a?(String) && msg.nil? - msg = context - context = nil + def on_error(&block) + raise 'Invalid block passed to Importer#on_error: block may accept 0, 1 or 2 arguments' if block.arity > 2 + + if has_errors? + case block.arity + when 0 then DslProxy.exec(self, &block) + when 1 then DslProxy.exec(self, @errors, &block) + when 2 then DslProxy.exec(self, @errors, error_summary, &block) + end end - @errors << Error.new(context, msg) + + self end + # Process the raw values for the first rows in a sheet, + # and attempt to build a map of the column layout, and + # detect the first row of real data + def find_header(raw_rows) + if headerless? + # Use implicit or explicit column position when told to not look for a header + next_index = 0 + @columns.each do |col| + unless col.position.nil? + next_index = col.fixed_index + end + col.data.index = next_index + next_index += 1 + end + @data.start_row = @start_row || 1 + @missing_headers = nil + return true + + else + # Match by testing + missing = nil + raw_rows.each_with_index do |row, i| + # Um, have data? + next unless row + + # Set up for this iteration + remaining = @columns.dup + + # Step through this row's raw values, and look for a matching column for all columns + row.each_with_index do |val, i| + col = remaining.detect {|c| c.match_header?(val.to_s, i) } + if col + remaining -= [col] + col.data.index = i + end + end + + if remaining.empty? + # Found all columns, have a map, update our start row to be the next line and return! + @data.start_row = @start_row || i+2 + @missing_headers = nil + return true + else + missing = remaining if (missing.nil? || missing.count > remaining.count) + end + end + + # If we get here, we're hosed + @missing_headers = missing.collect(&:key) if @missing_headers.nil? || @missing_headers.count > missing.count + false + end + end + + # Add a new row to our stash, parsing/filtering/validating as we go! + def add_row(line, raw_data) + # Gracefully handle custom parsers that return nil for a row's data + raw_data ||= [] + # Add the row + row = Row.new(self, line) + + # Parse out the values + values = {} + @columns.each do |col| + index = col.data.index + raw_val = raw_data[index] + if col.parse + # Use custom parser if this row has one + val = col.parse_value(row, raw_val) + else + # Otherwise use our standard parser + val = @reader.parse_value(raw_val, col.type) + end + values[col.key] = val + end + + # Set the values and filter if needed + row.set_values(values) + return nil if @filter && !@filter.call(row) + + # Row is desired, now validate values + @columns.each do |col| + val = values[col.key] + col.validate_value(row, val) + end + + # We is good + @data.rows << row + row + end + + # When true, one or more errors have been recorded during this import/process + # cycle. def has_errors? @errors.any? end - def add_warning(context, msg) + # Add an error to our error list. Will result in a failed import. + def add_error(context, msg = nil) if context.is_a?(String) && msg.nil? msg = context context = nil end - @warnings << Error.new(context, msg) + @errors << Error.new(context, msg) end - def has_warnings? - @warnings.any? - end - - # Returns a human-readable summary of the errors present on the importer + # Returns a human-readable summary of the errors present on the importer, or + # nil if no errors are present def error_summary + # Simple case return nil unless has_errors? - @errors.collect(&:summary).list_join(', ') + + # Group by error text - we often get the same error dozens of times + list = {} + @errors.each do |err| + errs = list[err.text] || [] + errs << err + list[err.text] = errs + end + + # Build summary & return + list.values.collect do |errs| + summary = errs.first.summary + if errs.count == 1 + summary + else + errs.count.to_s + ' x ' + summary + end + end.list_join(', ') end + + # After calling #import, you can dump the final values for each row + # as an array of hashes. Useful in debugging! For general processing, + # use #process or the block form of #import instead. + def to_a + @data.rows.collect(&:values) + end protected def reset @errors = [] - @warnings = [] - @sheets.values.each(&:reset) + @missing_headers = nil + @format = nil + @reader = nil + @data = Data.new end end