lib/dwc-archive/classification_normalizer.rb in dwc-archive-0.5.1 vs lib/dwc-archive/classification_normalizer.rb in dwc-archive-0.5.2

- old
+ new

@@ -20,21 +20,18 @@ class SynonymNormalized < Struct.new(:name, :canonical_name, :status);end class VernacularNormalized < Struct.new(:name, :language);end class ClassificationNormalizer - attr_accessor :verbose attr_reader :error_names, :tree - def initialize(dwc_instance, verbose = false) + def initialize(dwc_instance) @dwc = dwc_instance @core_fields = get_fields(@dwc.core) @extensions = @dwc.extensions.map { |e| [e, get_fields(e)] } @res = {} @parser = ParsleyStore.new(1,2) - @verbose = verbose - @verbose_count = 10000 @name_strings = {} @error_names = [] @tree = {} end @@ -45,13 +42,16 @@ def name_strings @name_strings.keys end def normalize + DarwinCore.logger_write(@dwc.object_id, "Started normalization of the classification") @res = {} ingest_core + DarwinCore.logger_write(@dwc.object_id, "Calculating the classification parent/child paths") calculate_classification_path + DarwinCore.logger_write(@dwc.object_id, "Ingesting data from extensions") ingest_extensions @res end private @@ -92,30 +92,27 @@ row[fields[:scientificname]] = scientific_name end def ingest_core raise RuntimeError, "Darwin Core core fields must contain taxon id and scientific name" unless (@core_fields[:id] && @core_fields[:scientificname]) - puts "Reading core information" if @verbose - rows = @dwc.core.read[0] - puts "Ingesting information from the core" if @verbose - rows.each_with_index do |r, i| - count = i + 1 - set_scientific_name(r, @core_fields) - puts "Ingesting %s'th record" % count if @verbose and count % @verbose_count == 0 - #core has AcceptedNameUsageId - if @core_fields[:acceptednameusageid] && r[@core_fields[:acceptednameusageid]] && r[@core_fields[:acceptednameusageid]] != r[@core_fields[:id]] - add_synonym_from_core(@core_fields[:acceptednameusageid], r) - elsif !@core_fields[:acceptednameusageid] && status_synonym?(r[@core_fields[:taxonomicstatus]]) - add_synonym_from_core(parent_id, r) - else - taxon = @res[r[@core_fields[:id]]] ? @res[r[@core_fields[:id]]] : @res[r[@core_fields[:id]]] = DarwinCore::TaxonNormalized.new - taxon.id = r[@core_fields[:id]] - taxon.current_name = r[@core_fields[:scientificname]] - taxon.current_name_canonical = r[@core_fields[:canonicalname]] - taxon.parent_id = r[parent_id] - taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank] - taxon.status = r[@core_fields[:taxonomicstatus]] if @core_fields[:taxonomicstatus] + @dwc.core.read do |rows| + rows[0].each do |r| + set_scientific_name(r, @core_fields) + #core has AcceptedNameUsageId + if @core_fields[:acceptednameusageid] && r[@core_fields[:acceptednameusageid]] && r[@core_fields[:acceptednameusageid]] != r[@core_fields[:id]] + add_synonym_from_core(@core_fields[:acceptednameusageid], r) + elsif !@core_fields[:acceptednameusageid] && status_synonym?(r[@core_fields[:taxonomicstatus]]) + add_synonym_from_core(parent_id, r) + else + taxon = @res[r[@core_fields[:id]]] ? @res[r[@core_fields[:id]]] : @res[r[@core_fields[:id]]] = DarwinCore::TaxonNormalized.new + taxon.id = r[@core_fields[:id]] + taxon.current_name = r[@core_fields[:scientificname]] + taxon.current_name_canonical = r[@core_fields[:canonicalname]] + taxon.parent_id = r[parent_id] + taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank] + taxon.status = r[@core_fields[:taxonomicstatus]] if @core_fields[:taxonomicstatus] + end end end end def parent_id @@ -170,35 +167,33 @@ ingest_vernaculars(e) if fields.keys.include? :vernacularname end end def ingest_synonyms(extension) - puts "Ingesting synonyms extension" if @verbose + DarwinCore.logger_write(@dwc.object_id, "Ingesting synonyms extension") ext, fields = *extension - ext.read[0].each_with_index do |r, i| - count = i + 1 - set_scientific_name(r, fields) - puts "Ingesting %s'th record" % count if @verbose && count % @verbose_count == 0 - @res[r[fields[:id]]].synonyms << SynonymNormalized.new( - r[fields[:scientificname]], - r[fields[:canonicalname]], - fields[:taxonomicstatus] ? r[fields[:taxonomicstatus]] : nil) + ext.read do |rows| + rows[0].each do |r| + set_scientific_name(r, fields) + @res[r[fields[:id]]].synonyms << SynonymNormalized.new( + r[fields[:scientificname]], + r[fields[:canonicalname]], + fields[:taxonomicstatus] ? r[fields[:taxonomicstatus]] : nil) + end end end def ingest_vernaculars(extension) - puts "Ingesting vernacular names" if @verbose + DarwinCore.logger_write(@dwc.object_id, "Ingesting vernacular names extension") ext, fields = *extension - ext.read[0].each_with_index do |r, i| - count = i + 1 - puts "Ingesting %s'th record" % count if @verbose && count % @verbose_count == 0 - @res[r[fields[:id]]].vernacular_names << VernacularNormalized.new( - r[fields[:vernacularname]], - fields[:languagecode] ? r[fields[:languagecode]] : nil) - add_name_string(r[fields[:vernacularname]]) + ext.read do |rows| + rows[0].each do |r| + @res[r[fields[:id]]].vernacular_names << VernacularNormalized.new( + r[fields[:vernacularname]], + fields[:languagecode] ? r[fields[:languagecode]] : nil) + add_name_string(r[fields[:vernacularname]]) + end end end end end - -