lib/dwc-archive/classification_normalizer.rb in dwc-archive-0.4.10 vs lib/dwc-archive/classification_normalizer.rb in dwc-archive-0.4.11
- old
+ new
@@ -2,17 +2,18 @@
require 'parsley-store'
class DarwinCore
class TaxonNormalized
- attr_accessor :id, :parent_id, :classification_path, :current_name, :current_name_canonical, :synonyms, :vernacular_names, :rank, :status
+ attr_accessor :id, :parent_id, :classification_path_id, :classification_path, :current_name, :current_name_canonical, :synonyms, :vernacular_names, :rank, :status
def initialize
@id = @parent_id = @rank = @status = nil
@current_name = ''
@current_name_canonical = ''
@classification_path = []
+ @classification_path_id = []
@synonyms = []
@vernacular_names = []
end
end
@@ -20,23 +21,33 @@
class SynonymNormalized < Struct.new(:name, :canonical_name, :status);end
class VernacularNormalized < Struct.new(:name, :language);end
class ClassificationNormalizer
attr_accessor :verbose
- attr_reader :error_names
+ attr_reader :error_names, :tree
def initialize(dwc_instance, verbose = false)
@dwc = dwc_instance
@core = get_fields(@dwc.core)
@extensions = @dwc.extensions.map { |e| [e, get_fields(e)] }
@res = {}
@parser = ParsleyStore.new(1,2)
@verbose = verbose
@verbose_count = 10000
+ @name_strings = {}
@error_names = []
+ @tree = {}
end
+ def add_name_string(name_string)
+ @name_strings[name_string] = 1 unless @name_strings[name_string]
+ end
+
+ def name_strings
+ @name_strings.keys
+ end
+
def normalize
@res = {}
ingest_core
calculate_classification_path
ingest_extensions
@@ -53,10 +64,12 @@
parsed_name = @parser.parse(a_scientific_name)[:scientificName]
rescue
@parser = ParsleyStore.new(1,2)
parsed_name = @parser.parse(a_scientific_name)[:scientificName]
end
+ add_name_string(a_scientific_name)
+ add_name_string(parsed_name[:canonical]) if parsed_name[:parsed]
parsed_name[:parsed] ? parsed_name[:canonical] : a_scientific_name
end
def get_fields(element)
data = element.fields.inject({}) { |res, f| res[f[:term].split('/')[-1].downcase.to_sym] = f[:index].to_i; res }
@@ -116,12 +129,15 @@
end
end
def get_classification_path(taxon)
return if !taxon.classification_path.empty?
+ current_node = {taxon.id => {}}
if DarwinCore.nil_field?(taxon.parent_id)
taxon.classification_path << taxon.current_name_canonical
+ taxon.classification_path_id << taxon.id
+ @tree.merge!(current_node)
else
begin
parent_cp = @res[taxon.parent_id].classification_path
rescue NoMethodError #name has a parent which is not a current name
error = "The parent of the taxon \'#{taxon.current_name}\' is deprecated"
@@ -129,12 +145,18 @@
raise DarwinCore::ParentNotCurrentError, error
end
if parent_cp.empty?
get_classification_path(@res[taxon.parent_id])
taxon.classification_path += @res[taxon.parent_id].classification_path + [taxon.current_name_canonical]
+ taxon.classification_path_id += @res[taxon.parent_id].classification_path_id + [taxon.id]
+ parent_node = @res[taxon.parent_id].classification_path_id.inject(@tree) {|node, id| node[id]}
+ parent_node.merge!(current_node)
else
taxon.classification_path += parent_cp + [taxon.current_name_canonical]
+ taxon.classification_path_id += @res[taxon.parent_id].classification_path_id + [taxon.id]
+ parent_node = @res[taxon.parent_id].classification_path_id.inject(@tree) {|node, id| node[id]}
+ parent_node.merge!(current_node)
end
end
end
def ingest_extensions
@@ -165,9 +187,10 @@
count = i + 1
puts "Ingesting %s'th record" % count if @verbose && count % @verbose_count == 0
@res[r[fields[:id]]].vernacular_names << VernacularNormalized.new(
r[fields[:vernacularname]],
fields[:languagecode] ? r[fields[:languagecode]] : nil)
+ add_name_string(r[fields[:vernacularname]])
end
end
end
end