lib/dwc-archive/classification_normalizer.rb in dwc-archive-0.5.8 vs lib/dwc-archive/classification_normalizer.rb in dwc-archive-0.5.9
- old
+ new
@@ -27,10 +27,11 @@
# Sets up the normalizer state for one Darwin Core archive object.
# (Diff listing: the line marked "+" exists only in the newer version.)
#
# dwc_instance - object whose `core` and `extensions` are read here
def initialize(dwc_instance)
@dwc = dwc_instance
# Result of get_fields for the core file
@core_fields = get_fields(@dwc.core)
# One [extension, get_fields(extension)] pair per extension
@extensions = @dwc.extensions.map { |e| [e, get_fields(e)] }
@normalized_data = {}
# Written by add_synonym_from_core, keyed by the synonym row's id
+ @synonyms = {}
# NOTE(review): the meaning of the (1,2) arguments is not visible in this file section
@parser = ParsleyStore.new(1,2)
@name_strings = {}
@error_names = []
@tree = {}
end
@@ -59,11 +60,10 @@
# Returns the canonical form of a scientific name as produced by @parser,
# or the name itself when the parser result is nil/empty.
# (Diff listing: the line marked "-" is present only in the older version.)
#
# a_scientific_name - the name string to parse
def get_canonical_name(a_scientific_name)
# R19 is defined outside this view -- presumably a Ruby 1.9+ check; confirm.
# force_encoding re-tags the receiver in place, so the caller's string is modified.
if R19
a_scientific_name.force_encoding('utf-8')
end
canonical_name = @parser.parse(a_scientific_name, :canonical_only => true)
- add_name_string(canonical_name) unless canonical_name.to_s.empty?
# to_s makes the emptiness check safe for a nil parse result
canonical_name.to_s.empty? ? a_scientific_name : canonical_name
end
def get_fields(element)
data = element.fields.inject({}) { |res, f| res[f[:term].split('/')[-1].downcase.to_sym] = f[:index].to_i; res }
@@ -74,15 +74,19 @@
# Tells whether a taxonomic status value denotes a synonym.
#
# status - the status value read from a row (may be nil)
#
# Returns true when the status text begins with "syn", false otherwise
# (including for nil).
def status_synonym?(status)
  # \A anchors to the start of the whole value; ^ would also match at the
  # start of any later line inside a multi-line value.
  !!(status.to_s =~ /\Asyn/)
end
# Registers a core row as a synonym of its accepted taxon.
#
# taxon_id - index of the row column that holds the accepted taxon's id
# row      - array of field values for one core row
#
# Side effects: records synonym id -> accepted taxon id in @synonyms, creates
# the accepted taxon entry in @normalized_data if it does not exist yet,
# appends the synonym to that taxon, and passes both synonym names to
# add_name_string.
def add_synonym_from_core(taxon_id, row)
  accepted_id = row[taxon_id]
  # Store the accepted taxon's id (the row value), not the column index, so
  # that @normalized_data[@synonyms[id]] resolves to the accepted taxon.
  @synonyms[row[@core_fields[:id]]] = accepted_id
  taxon = (@normalized_data[accepted_id] ||= DarwinCore::TaxonNormalized.new)
  synonym = SynonymNormalized.new(
    row[@core_fields[:scientificname]],
    row[@core_fields[:canonicalname]],
    @core_fields[:taxonomicstatus] ? row[@core_fields[:taxonomicstatus]] : nil)
  taxon.synonyms << synonym
  add_name_string(synonym.name)
  add_name_string(synonym.canonical_name)
end
def set_scientific_name(row, fields)
canonical_name = fields[:scientificnameauthorship] ? row[fields[:scientificname]] : get_canonical_name(row[fields[:scientificname]])
fields[:canonicalname] = row.size
@@ -93,11 +97,10 @@
def ingest_core
raise RuntimeError, "Darwin Core core fields must contain taxon id and scientific name" unless (@core_fields[:id] && @core_fields[:scientificname])
@dwc.core.read do |rows|
rows[0].each do |r|
- add_name_string(r[@core_fields[:scientificname]])
set_scientific_name(r, @core_fields)
#core has AcceptedNameUsageId
if @core_fields[:acceptednameusageid] && r[@core_fields[:acceptednameusageid]] && r[@core_fields[:acceptednameusageid]] != r[@core_fields[:id]]
add_synonym_from_core(@core_fields[:acceptednameusageid], r)
elsif !@core_fields[:acceptednameusageid] && status_synonym?(r[@core_fields[:taxonomicstatus]])
@@ -108,10 +111,12 @@
taxon.current_name = r[@core_fields[:scientificname]]
taxon.current_name_canonical = r[@core_fields[:canonicalname]]
taxon.parent_id = r[parent_id]
taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank]
taxon.status = r[@core_fields[:taxonomicstatus]] if @core_fields[:taxonomicstatus]
+ add_name_string(taxon.current_name)
+ add_name_string(taxon.current_name_canonical)
end
end
end
end
@@ -121,15 +126,12 @@
# Walks every taxon in @normalized_data and computes a classification path
# for each one whose path is still empty.
# (Diff listing: "-" lines are the older exception-based handling,
# "+" lines the newer return-value-based handling.)
def calculate_classification_path
# Counter incremented by get_classification_path on each call
@paths_num = 0
@normalized_data.each do |taxon_id, taxon|
# Skip taxa that already have a path
next if !taxon.classification_path.empty?
- begin
- get_classification_path(taxon)
- rescue DarwinCore::ParentNotCurrentError
- next
- end
# get_classification_path returns the string 'error' on its failure paths
+ res = get_classification_path(taxon)
# Last statement of the block, so this `next` simply moves on to the following taxon
+ next if res == 'error'
end
end
def get_classification_path(taxon)
@paths_num += 1
@@ -139,28 +141,42 @@
if DarwinCore.nil_field?(taxon.parent_id)
taxon.classification_path << taxon.current_name_canonical
taxon.classification_path_id << taxon.id
@tree.merge!(current_node)
else
- begin
+ parent_cp = nil
+ if @normalized_data[taxon.parent_id]
parent_cp = @normalized_data[taxon.parent_id].classification_path
- rescue NoMethodError #name has a parent which is not a current name
- error = "The parent of the taxon \'#{taxon.current_name}\' is deprecated"
- @error_names << {:name => taxon, :error => error}
- raise DarwinCore::ParentNotCurrentError, error
+ else
+ current_parent = @normalized_data[@synonyms[taxon.parent_id]]
+ if current_parent
+ error = "WARNING: The parent of the taxon \'#{taxon.current_name}\' is deprecated"
+ @error_names << {:name => taxon, :error => :deprecated_parent, :current_parent => current_parent }
+ parent_cp = current_parent.classification_path
+ else
+ error = "WARNING: The parent of the taxon \'#{taxon.current_name}\' not found"
+ @error_names << {:name => taxon, :error => :deprecated_parent, :current_parent => nil}
+ end
end
+ return 'error' unless parent_cp
if parent_cp.empty?
- get_classification_path(@normalized_data[taxon.parent_id])
+ res = get_classification_path(@normalized_data[taxon.parent_id])
+ return res if res == 'error'
taxon.classification_path += @normalized_data[taxon.parent_id].classification_path + [taxon.current_name_canonical]
taxon.classification_path_id += @normalized_data[taxon.parent_id].classification_path_id + [taxon.id]
parent_node = @normalized_data[taxon.parent_id].classification_path_id.inject(@tree) {|node, id| node[id]}
parent_node.merge!(current_node)
else
taxon.classification_path += parent_cp + [taxon.current_name_canonical]
taxon.classification_path_id += @normalized_data[taxon.parent_id].classification_path_id + [taxon.id]
parent_node = @normalized_data[taxon.parent_id].classification_path_id.inject(@tree) {|node, id| node[id]}
- parent_node.merge!(current_node)
+ begin
+ parent_node.merge!(current_node)
+ rescue NoMethodError => e
+ DarwinCore.logger_write(@dwc.object_id, "Error '%s' taxon %s" % [e.message, taxon.id])
+ return 'error'
+ end
end
end
end
def ingest_extensions
@@ -174,31 +190,34 @@
# Reads a synonyms extension and attaches each row, as a SynonymNormalized,
# to the taxon in @normalized_data that has the row's id.
# (Diff listing: "-" lines belong to the older version, "+" lines to the newer one.)
#
# extension - pair of [extension object, field hash], as stored in @extensions
def ingest_synonyms(extension)
DarwinCore.logger_write(@dwc.object_id, "Ingesting synonyms extension")
ext, fields = *extension
ext.read do |rows|
# Only the first element of the yielded value is iterated here
rows[0].each do |r|
- add_name_string(r[fields[:scientificname]])
# Expected to make r[fields[:canonicalname]] available; the full body of
# set_scientific_name is not visible in this file section
set_scientific_name(r, fields)
- @normalized_data[r[fields[:id]]].synonyms << SynonymNormalized.new(
+ synonym = SynonymNormalized.new(
r[fields[:scientificname]],
r[fields[:canonicalname]],
fields[:taxonomicstatus] ? r[fields[:taxonomicstatus]] : nil)
# NOTE(review): if the row's id has no entry in @normalized_data the lookup
# is nil and `.synonyms` raises NoMethodError
+ @normalized_data[r[fields[:id]]].synonyms << synonym
# Name strings are registered from the stored synonym object
+ add_name_string(synonym.name)
+ add_name_string(synonym.canonical_name)
end
end
end
# Reads a vernacular names extension and attaches each row, as a
# VernacularNormalized, to the taxon in @normalized_data that has the row's id.
# (Diff listing: "-" lines belong to the older version, which called
# add_name_string twice per row; "+" lines belong to the newer one.)
#
# extension - pair of [extension object, field hash], as stored in @extensions
def ingest_vernaculars(extension)
DarwinCore.logger_write(@dwc.object_id, "Ingesting vernacular names extension")
ext, fields = *extension
ext.read do |rows|
# Only the first element of the yielded value is iterated here
rows[0].each do |r|
- add_name_string(r[fields[:vernacularname]])
- @normalized_data[r[fields[:id]]].vernacular_names << VernacularNormalized.new(
+ vernacular = VernacularNormalized.new(
r[fields[:vernacularname]],
fields[:languagecode] ? r[fields[:languagecode]] : nil)
- add_name_string(r[fields[:vernacularname]])
# NOTE(review): if the row's id has no entry in @normalized_data the lookup
# is nil and `.vernacular_names` raises NoMethodError
+ @normalized_data[r[fields[:id]]].vernacular_names << vernacular
+ add_name_string(vernacular.name)
end
end
end
end
end
+