lib/dwc-archive/classification_normalizer.rb in dwc-archive-0.4.10 vs lib/dwc-archive/classification_normalizer.rb in dwc-archive-0.4.11

- old
+ new

@@ -2,17 +2,18 @@ require 'parsley-store' class DarwinCore class TaxonNormalized - attr_accessor :id, :parent_id, :classification_path, :current_name, :current_name_canonical, :synonyms, :vernacular_names, :rank, :status + attr_accessor :id, :parent_id, :classification_path_id, :classification_path, :current_name, :current_name_canonical, :synonyms, :vernacular_names, :rank, :status def initialize @id = @parent_id = @rank = @status = nil @current_name = '' @current_name_canonical = '' @classification_path = [] + @classification_path_id = [] @synonyms = [] @vernacular_names = [] end end @@ -20,23 +21,33 @@ class SynonymNormalized < Struct.new(:name, :canonical_name, :status);end class VernacularNormalized < Struct.new(:name, :language);end class ClassificationNormalizer attr_accessor :verbose - attr_reader :error_names + attr_reader :error_names, :tree def initialize(dwc_instance, verbose = false) @dwc = dwc_instance @core = get_fields(@dwc.core) @extensions = @dwc.extensions.map { |e| [e, get_fields(e)] } @res = {} @parser = ParsleyStore.new(1,2) @verbose = verbose @verbose_count = 10000 + @name_strings = {} @error_names = [] + @tree = {} end + def add_name_string(name_string) + @name_strings[name_string] = 1 unless @name_strings[name_string] + end + + def name_strings + @name_strings.keys + end + def normalize @res = {} ingest_core calculate_classification_path ingest_extensions @@ -53,10 +64,12 @@ parsed_name = @parser.parse(a_scientific_name)[:scientificName] rescue @parser = ParsleyStore.new(1,2) parsed_name = @parser.parse(a_scientific_name)[:scientificName] end + add_name_string(a_scientific_name) + add_name_string(parsed_name[:canonical]) if parsed_name[:parsed] parsed_name[:parsed] ? parsed_name[:canonical] : a_scientific_name end def get_fields(element) data = element.fields.inject({}) { |res, f| res[f[:term].split('/')[-1].downcase.to_sym] = f[:index].to_i; res } @@ -116,12 +129,15 @@ end end def get_classification_path(taxon) return if !taxon.classification_path.empty? + current_node = {taxon.id => {}} if DarwinCore.nil_field?(taxon.parent_id) taxon.classification_path << taxon.current_name_canonical + taxon.classification_path_id << taxon.id + @tree.merge!(current_node) else begin parent_cp = @res[taxon.parent_id].classification_path rescue NoMethodError #name has a parent which is not a current name error = "The parent of the taxon \'#{taxon.current_name}\' is deprecated" @@ -129,12 +145,18 @@ raise DarwinCore::ParentNotCurrentError, error end if parent_cp.empty? get_classification_path(@res[taxon.parent_id]) taxon.classification_path += @res[taxon.parent_id].classification_path + [taxon.current_name_canonical] + taxon.classification_path_id += @res[taxon.parent_id].classification_path_id + [taxon.id] + parent_node = @res[taxon.parent_id].classification_path_id.inject(@tree) {|node, id| node[id]} + parent_node.merge!(current_node) else taxon.classification_path += parent_cp + [taxon.current_name_canonical] + taxon.classification_path_id += @res[taxon.parent_id].classification_path_id + [taxon.id] + parent_node = @res[taxon.parent_id].classification_path_id.inject(@tree) {|node, id| node[id]} + parent_node.merge!(current_node) end end end def ingest_extensions @@ -165,9 +187,10 @@ count = i + 1 puts "Ingesting %s'th record" % count if @verbose && count % @verbose_count == 0 @res[r[fields[:id]]].vernacular_names << VernacularNormalized.new( r[fields[:vernacularname]], fields[:languagecode] ? r[fields[:languagecode]] : nil) + add_name_string(r[fields[:vernacularname]]) end end end end