lib/dwc-archive/classification_normalizer.rb in dwc-archive-0.4.2 vs lib/dwc-archive/classification_normalizer.rb in dwc-archive-0.4.3

- old
+ new

@@ -1,10 +1,10 @@ # encoding: utf-8 -require 'biodiversity' +require 'parsley-store' class DarwinCore - + class TaxonNormalized attr_accessor :id, :parent_id, :classification_path, :current_name, :current_name_canonical, :synonyms, :vernacular_names, :rank, :status def initialize @id = @parent_id = @classification_path = @current_name = @current_name_canonical = @rank = @status = nil @@ -16,16 +16,19 @@ class SynonymNormalized < Struct.new(:name, :canonical_name, :status);end class VernacularNormalized < Struct.new(:name, :language);end class ClassificationNormalizer - def initialize(dwc_instance) + + def initialize(dwc_instance, verbose = false) @dwc = dwc_instance @core = get_fields(@dwc.core) @extensions = @dwc.extensions.map { |e| [e, get_fields(e)] } @res = {} - @parser = ScientificNameParser.new + @parser = ParsleyStore.new(1,2) + @verbose = verbose + @verbose_count = 1000 end def normalize injest_core calculate_classification_path @@ -40,11 +43,11 @@ a_scientific_name.force_encoding('utf-8') end begin parsed_name = @parser.parse(a_scientific_name)[:scientificName] rescue - @parser = ScientificNameParser.new + @parser = ParsleyStore.new(1,2) parsed_name = @parser.parse(a_scientific_name)[:scientificName] end parsed_name[:parsed] ? parsed_name[:canonical] : a_scientific_name end @@ -61,16 +64,21 @@ def add_synonym_from_core(taxon_id, row) taxon = @res[row[taxon_id]] ? @res[row[taxon_id]] : @res[row[taxon_id]] = DarwinCore::TaxonNormalized.new taxon.synonyms << SynonymNormalized.new( row[@core[:scientificname]], canonical_name(row[@core[:scientificname]]), - row[@core[:taxonomicstatus]]) + @core[:taxonomicstatus] ? row[@core[:taxonomicstatus]] : nil) end def injest_core raise RuntimeError, "Darwin Core core fields must contain taxon id and scientific name" unless (@core[:id] && @core[:scientificname]) - @dwc.core.read[0].each do |r| + puts "Reading core information" if @verbose + rows = @dwc.core.read[0] + puts "Injesting information from the core" if @verbose + rows.each_with_index do |r, i| + count = i + 1 + puts "Injesting %s'th record" % count if @verbose and count % @verbose_count == 0 #core has AcceptedNameUsageId if @core[:acceptednameusageid] && r[@core[:acceptednameusageid]] && r[@core[:acceptednameusageid]] != r[@core[:id]] add_synonym_from_core(@core[:acceptednameusageid], r) elsif !@core[:acceptednameusageid] && status_synonym?(r[@core[:taxonomicstatus]]) add_synonym_from_core(parent_id, r) @@ -100,11 +108,11 @@ def get_classification_path(taxon) return if taxon.classification_path if DarwinCore.nil_field?(taxon.parent_id) taxon.classification_path = [taxon.current_name_canonical] else - parent_cp = @res[taxon.parent_id].classification_path + parent_cp = @res[taxon.parent_id].classification_path if parent_cp taxon.classification_path = parent_cp + [taxon.current_name_canonical] else get_classification_path(@res[taxon.parent_id]) taxon.classification_path = @res[taxon.parent_id].classification_path + [taxon.current_name_canonical] @@ -119,25 +127,32 @@ injest_vernaculars(e) if fields.keys.include? :vernacularname end end def injest_synonyms(extension) + puts "Injesting synonyms extension" if @verbose ext, fields = *extension - ext.read[0].each do |r| + ext.read[0].each_with_index do |r, i| + count = i + 1 + puts "Injesting %s'th record" % count if @verbose && count % @verbose_count == 0 @res[r[fields[:id]]].synonyms << SynonymNormalized.new( r[fields[:scientificname]], canonical_name(r[fields[:scientificname]]), fields[:taxonomicstatus] ? r[fields[:taxonomicstatus]] : nil) end end def injest_vernaculars(extension) + puts "Injesting vernacular names" if @verbose ext, fields = *extension - ext.read[0].each do |r| + ext.read[0].each_with_index do |r, i| + count = i + 1 + puts "Injesting %s'th record" % count if @verbose && count % @verbose_count == 0 @res[r[fields[:id]]].vernacular_names << VernacularNormalized.new( r[fields[:vernacularname]], fields[:languagecode] ? r[fields[:languagecode]] : nil) end end end end +