class DarwinCore
  # Mixin for objects that ingest one delimited text file.
  #
  # Neither @data nor @path is assigned in this module: the including class
  # must set @data (a hash with :attributes, :field and a :location entry) and
  # @path (the directory the file lives in) before calling get_attributes.
  module Ingester
    attr_reader :data, :properties, :encoding, :field_separator
    attr_reader :file_path, :fields, :line_separator, :quote_character, :ignore_headers

    # Historical name of the separator reader. It used to be generated by
    # attr_reader and therefore read @fields_separator, which is never
    # assigned; it now returns the separator that is actually in use.
    #
    # @return [String, nil] the column separator, nil before get_attributes ran
    def fields_separator
      @field_separator
    end

    # Reads @file_path as CSV and sorts every data row into accepted rows or
    # error rows.
    #
    # A row is an error when it has fewer than @fields.size + 1 columns
    # (NOTE(review): presumably the extra column is the record id - confirm)
    # or when its content is not valid UTF-8.
    #
    # @param batch_size [Integer] number of data rows per yielded batch
    # @yield [Array(Array, Array)] [rows, errors] after every batch_size data
    #   rows; both accumulators are reset afterwards
    # @return [Array(Array, Array)] [rows, errors] not yet yielded - everything
    #   when no block is given, the incomplete last batch otherwise
    def read(batch_size = 10000)
      res = []
      errors = []
      # With a skipped header the n-th data row has index n, without one it has
      # index n - 1; the offset makes a batch end after exactly batch_size data
      # rows in both cases.
      index_fix = @ignore_headers ? 0 : 1
      args = { col_sep: @field_separator }
      args[:quote_char] = @quote_character if @quote_character != ''
      # Options must be passed as keywords (a positional hash is taken for the
      # file mode on Ruby 3); the block form closes the file when done.
      CSV.open(@file_path, **args) do |csv|
        csv.each_with_index do |row, i|
          next if @ignore_headers && i == 0
          if @fields.size > (row.size - 1)
            errors << row
          else
            process_csv_row(res, errors, row)
          end
          next unless block_given? && ((i + index_fix) % batch_size).zero?
          yield [res, errors]
          res = []
          errors = []
        end
      end
      [res, errors]
    end

    private

    # Appends row to result when its joined content is valid UTF-8, otherwise
    # to errors.
    #
    # @param result [Array] accumulator for accepted rows
    # @param errors [Array] accumulator for rejected rows
    # @param row [Array] one parsed CSV row
    # @return [Array] the accumulator the row was appended to
    def process_csv_row(result, errors, row)
      str = row.join('')
      valid =
        if defined? FasterCSV
          # Strings carry no encoding on this code path, so validity is checked
          # with a regular expression loaded on demand.
          require File.join(File.dirname(__FILE__), 'utf_regex_ruby18')
          UTF8RGX === str
        else
          str = str.force_encoding('utf-8')
          str.encoding.name == "UTF-8" && str.valid_encoding?
        end
      valid ? result << row : errors << row
    end

    # Copies the settings found in @data[:attributes] into instance variables
    # and resolves the data file path and the field list.
    #
    # @param exception [Class] error class raised for invalid metadata
    # @raise [exception] on an unsupported encoding, a missing file location or
    #   an empty field list
    # @return [nil]
    def get_attributes(exception)
      @properties = @data[:attributes]
      @encoding = @properties[:encoding] || 'UTF-8'
      unless ["utf-8", "utf8", "utf-16", "utf16"].include?(@encoding.downcase)
        raise exception, "No support for encodings other than utf-8 or utf-16 at the moment"
      end
      @field_separator = get_field_separator
      @quote_character = @properties[:fieldsEnclosedBy] || ""
      @line_separator = @properties[:linesTerminatedBy] || "\n"
      # Only 1 and true enable header skipping; nil, false and any other value
      # leave it off.
      @ignore_headers = [1, true].include?(@properties[:ignoreHeaderLines])
      @file_path = get_file_path
      raise exception, "No file data" unless @file_path
      @fields = get_fields
      raise exception, "No data fields are found" if @fields.empty?
    end

    # Looks the file location up in @data[:location], @data[:attributes] and
    # @data[:files], in that order.
    #
    # @return [String, nil] the location joined onto @path, or nil when no
    #   location is present
    def get_file_path
      attributes = @data[:attributes] || {}
      files = @data[:files] || {}
      file = @data[:location] || attributes[:location] || files[:location]
      file && File.join(@path, file)
    end

    # Normalises @data[:field] to an array (a single entry is wrapped, a
    # missing one becomes an empty array) and stores it back into @data.
    #
    # @return [Array] the :attributes value of every field entry
    def get_fields
      raw = @data[:field]
      @data[:field] =
        case raw
        when nil then []
        when Array then raw
        else [raw]
        end
      @data[:field].map { |f| f[:attributes] }
    end

    # @return [String] the :fieldsTerminatedBy property, defaulting to a comma;
    #   the two-character sequence backslash + t is turned into a real tab
    def get_field_separator
      res = @properties[:fieldsTerminatedBy] || ','
      res = "\t" if res == "\\t"
      res
    end
  end
end