lib/dwc-archive/classification_normalizer.rb in dwc-archive-0.4.13 vs lib/dwc-archive/classification_normalizer.rb in dwc-archive-0.4.14

- old
+ new

@@ -25,11 +25,11 @@ attr_accessor :verbose attr_reader :error_names, :tree def initialize(dwc_instance, verbose = false) @dwc = dwc_instance - @core = get_fields(@dwc.core) + @core_fields = get_fields(@dwc.core) @extensions = @dwc.extensions.map { |e| [e, get_fields(e)] } @res = {} @parser = ParsleyStore.new(1,2) @verbose = verbose @verbose_count = 10000 @@ -54,11 +54,11 @@ @res end private - def canonical_name(a_scientific_name) + def get_canonical_name(a_scientific_name) if R19 a_scientific_name.force_encoding('utf-8') end begin parsed_name = @parser.parse(a_scientific_name)[:scientificName] @@ -82,42 +82,51 @@ end def add_synonym_from_core(taxon_id, row) taxon = @res[row[taxon_id]] ? @res[row[taxon_id]] : @res[row[taxon_id]] = DarwinCore::TaxonNormalized.new taxon.synonyms << SynonymNormalized.new( - row[@core[:scientificname]], - canonical_name(row[@core[:scientificname]]), - @core[:taxonomicstatus] ? row[@core[:taxonomicstatus]] : nil) + row[@core_fields[:scientificname]], + row[@core_fields[:canonicalname]], + @core_fields[:taxonomicstatus] ? row[@core_fields[:taxonomicstatus]] : nil) end + def set_scientific_name(row, fields) + canonical_name = fields[:scientificnameauthorship] ? row[fields[:scientificname]] : get_canonical_name(row[fields[:scientificname]]) + fields[:canonicalname] = row.size + row << canonical_name + scientific_name = (fields[:scientificnameauthorship] && row[fields[:scientificnameauthorship]].to_s.strip != '') ? row[fields[:scientificname]].strip + ' ' + row[fields[:scientificnameauthorship]].strip : row[fields[:scientificname]].strip + row[fields[:scientificname]] = scientific_name + end + def ingest_core - raise RuntimeError, "Darwin Core core fields must contain taxon id and scientific name" unless (@core[:id] && @core[:scientificname]) + raise RuntimeError, "Darwin Core core fields must contain taxon id and scientific name" unless (@core_fields[:id] && @core_fields[:scientificname]) puts "Reading core information" if @verbose rows = @dwc.core.read[0] puts "Ingesting information from the core" if @verbose rows.each_with_index do |r, i| count = i + 1 + set_scientific_name(r, @core_fields) puts "Ingesting %s'th record" % count if @verbose and count % @verbose_count == 0 #core has AcceptedNameUsageId - if @core[:acceptednameusageid] && r[@core[:acceptednameusageid]] && r[@core[:acceptednameusageid]] != r[@core[:id]] - add_synonym_from_core(@core[:acceptednameusageid], r) - elsif !@core[:acceptednameusageid] && status_synonym?(r[@core[:taxonomicstatus]]) + if @core_fields[:acceptednameusageid] && r[@core_fields[:acceptednameusageid]] && r[@core_fields[:acceptednameusageid]] != r[@core_fields[:id]] + add_synonym_from_core(@core_fields[:acceptednameusageid], r) + elsif !@core_fields[:acceptednameusageid] && status_synonym?(r[@core_fields[:taxonomicstatus]]) add_synonym_from_core(parent_id, r) else - taxon = @res[r[@core[:id]]] ? @res[r[@core[:id]]] : @res[r[@core[:id]]] = DarwinCore::TaxonNormalized.new - taxon.id = r[@core[:id]] - taxon.current_name = r[@core[:scientificname]] - taxon.current_name_canonical = canonical_name(r[@core[:scientificname]]) + taxon = @res[r[@core_fields[:id]]] ? @res[r[@core_fields[:id]]] : @res[r[@core_fields[:id]]] = DarwinCore::TaxonNormalized.new + taxon.id = r[@core_fields[:id]] + taxon.current_name = r[@core_fields[:scientificname]] + taxon.current_name_canonical = r[@core_fields[:canonicalname]] taxon.parent_id = r[parent_id] - taxon.rank = r[@core[:taxonrank]] if @core[:taxonrank] - taxon.status = r[@core[:taxonomicstatus]] if @core[:taxonomicstatus] + taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank] + taxon.status = r[@core_fields[:taxonomicstatus]] if @core_fields[:taxonomicstatus] end end end def parent_id - parent_id_field = @core[:highertaxonid] || @core[:parentnameusageid] + parent_id_field = @core_fields[:highertaxonid] || @core_fields[:parentnameusageid] end def calculate_classification_path @res.each do |taxon_id, taxon| next if !taxon.classification_path.empty? @@ -170,13 +179,14 @@ def ingest_synonyms(extension) puts "Ingesting synonyms extension" if @verbose ext, fields = *extension ext.read[0].each_with_index do |r, i| count = i + 1 + set_scientific_name(r, fields) puts "Ingesting %s'th record" % count if @verbose && count % @verbose_count == 0 @res[r[fields[:id]]].synonyms << SynonymNormalized.new( r[fields[:scientificname]], - canonical_name(r[fields[:scientificname]]), + r[fields[:canonicalname]], fields[:taxonomicstatus] ? r[fields[:taxonomicstatus]] : nil) end end def ingest_vernaculars(extension)