Sha256: f27ee6475d6ad3213cfd2bac49afdf758e96f3ea00255938ff6fd4c47fe223b2

Contents?: true

Size: 1.44 KB

Versions: 1

Compression:

Stored size: 1.44 KB

Contents

class DarwinCore
  # Mixin that reads the delimited data file of an archive component.
  #
  # The module never assigns the state it works with; it expects the including
  # class to have set @file_path, @field_separator, @quote_character and
  # @ignore_headers before #read is called, and @data, @path and @properties
  # before the private helpers are called.
  module Ingester
    attr_reader :data, :properties, :encoding, :fields_separator
    attr_reader :file_path, :fields, :line_separator, :quote_character, :ignore_headers
    # #read and #get_field_separator use the singular name, so expose it too.
    # The plural :fields_separator reader above is kept only so that existing
    # callers do not break.
    attr_reader :field_separator

    # Reads the file at @file_path and sorts its rows into valid UTF-8 rows and
    # rows that fail the encoding check.
    #
    # When @ignore_headers is set, the first row is skipped and not counted.
    #
    # With a block, [rows, errors] is yielded every time +batch_size+ rows have
    # been processed and both accumulators start again empty. Rows left over
    # after the last full batch are NOT yielded; they are the return value.
    #
    # Without a block every row is accumulated and returned at the end.
    #
    # batch_size - number of processed rows per yielded batch (default 10000).
    #
    # Returns a two element Array: [rows, errors].
    def read(batch_size = 10000)
      res = []
      errors = []
      processed = 0
      open_csv do |csv|
        csv.each_with_index do |r, i|
          next if @ignore_headers && i == 0
          utf8_row?(r) ? res << r : errors << r
          # Count rows actually handled rather than deriving the count from the
          # file index, so the header (when skipped) never shifts a batch edge.
          processed += 1
          if block_given? && processed % batch_size == 0
            yield [res, errors]
            res = []
            errors = []
          end
        end
      end
      [res, errors]
    end

    private

    # Opens @file_path with the configured separator and quote character and
    # hands the reader to the block, closing the file when the block returns.
    #
    # The options are written as brace-less hash arguments on purpose: that
    # form parses on old Rubies and is passed as keywords on Ruby 3, where a
    # positional options Hash is no longer accepted as keyword options.
    def open_csv(&block)
      if @quote_character != ''
        CSV.open(@file_path, :col_sep => @field_separator, :quote_char => @quote_character, &block)
      else
        CSV.open(@file_path, :col_sep => @field_separator, &block)
      end
    end

    # Returns a truthy value when the concatenated fields of +row+ form valid
    # UTF-8.
    def utf8_row?(row)
      str = row.join('')
      if defined? FasterCSV
        # Strings carry no encoding on this code path, so a regular expression
        # is used instead. UTF8RGX is presumably defined by the file required
        # here -- it is not visible from this file.
        require File.join(File.dirname(__FILE__), 'utf_regex_ruby18')
        UTF8RGX === str
      else
        str.force_encoding('utf-8').valid_encoding?
      end
    end

    # Builds the path of the data file by joining @path with the first location
    # found in @data: the top level entry, then :attributes, then :files.
    def get_file_path
      file = @data[:location] || @data[:attributes][:location] || @data[:files][:location]
      File.join(@path, file)
    end

    # Returns the :attributes of every field description in @data[:field].
    # A single description is wrapped in an Array first (and stored back in
    # @data) so that one field and many fields are handled alike.
    def get_fields
      @data[:field] = [@data[:field]] unless @data[:field].is_a?(Array)
      @data[:field].map { |f| f[:attributes] }
    end

    # Returns the separator from @properties[:fieldsTerminatedBy], or ',' when
    # none is given. The two character sequence backslash + t is translated
    # into a real tab character.
    def get_field_separator
      res = @properties[:fieldsTerminatedBy] || ','
      res = "\t" if res == "\\t"
      res
    end
  end
end

Version data entries

1 entries across 1 versions & 1 rubygems

Version Path
dwc-archive-0.2.1 lib/dwc-archive/ingester.rb