lib/dwc_archive/ingester.rb in dwc-archive-1.1.3 vs lib/dwc_archive/ingester.rb in dwc-archive-1.1.4

- old
+ new

@@ -1,33 +1,34 @@ -# encoding: utf-8 class DarwinCore # This module abstracts information for reading csv file to be used # in several classes which need such functionality module Ingester - attr_reader :data, :properties, :encoding, :fields_separator, :size - attr_reader :file_path, :fields, :line_separator, :quote_character, - :ignore_headers + attr_reader :data, :properties, :encoding, :fields_separator, :size, :file_path, :fields, :line_separator, + :quote_character, :ignore_headers def size @size ||= init_size end def read(batch_size = 10_000) DarwinCore.logger_write(@dwc.object_id, "Reading #{name} data") res = [] errors = [] args = define_csv_args - min_size = @fields.map { |f| f[:index].to_i || 0 }.sort[-1] + 1 - csv = CSV.new(open(@file_path), args) + min_size = @fields.map { |f| f[:index].to_i || 0 }.max + 1 + csv = CSV.new(open(@file_path), **args) csv.each_with_index do |r, i| next if @ignore_headers && i == 0 + min_size > r.size ? errors << r : process_csv_row(res, errors, r) next if i == 0 || i % batch_size != 0 + DarwinCore.logger_write(@dwc.object_id, format("Ingested %s records from %s", i, name)) next unless block_given? + yield [res, errors] res = [] errors = [] end yield [res, errors] if block_given? @@ -68,39 +69,43 @@ init_fields end def init_encoding @encoding = @properties[:encoding] || "UTF-8" - accepted_encoding = ["utf-8", "utf8", "utf-16", "utf16"]. + accepted_encoding = %w[utf-8 utf8 utf-16 utf16]. include?(@encoding.downcase) - fail( - DarwinCore::EncodingError, - "No support for encodings other than utf-8 or utf-16 at the moment" - ) unless accepted_encoding + unless accepted_encoding + raise( + DarwinCore::EncodingError, + "No support for encodings other than utf-8 or utf-16 at the moment" + ) + end end def init_file_path file = @data[:location] || @data[:attributes][:location] || @data[:files][:location] @file_path = File.join(@path, file) - fail DarwinCore::FileNotFoundError, "No file data" unless @file_path + raise DarwinCore::FileNotFoundError, "No file data" unless @file_path end def init_fields @data[:field] = [data[:field]] if data[:field].class != Array @fields = @data[:field].map { |f| f[:attributes] } - fail DarwinCore::InvalidArchiveError, - "No data fields are found" if @fields.empty? + if @fields.empty? + raise DarwinCore::InvalidArchiveError, + "No data fields are found" + end end def init_field_separator res = @properties[:fieldsTerminatedBy] || "," res = "\t" if res == "\\t" res end def init_size - `wc -l #{@file_path}`.match(/^\s*([\d]+)\s/)[1].to_i + `wc -l #{@file_path}`.match(/^\s*(\d+)\s/)[1].to_i end end end