lib/dwc-archive/classification_normalizer.rb in dwc-archive-0.9.5 vs lib/dwc-archive/classification_normalizer.rb in dwc-archive-0.9.6
- old
+ new
@@ -2,11 +2,14 @@
require 'parsley-store'
class DarwinCore
class TaxonNormalized
- attr_accessor :id, :local_id, :global_id, :source, :parent_id, :classification_path_id, :classification_path, :linnean_classification_path, :current_name, :current_name_canonical, :synonyms, :vernacular_names, :rank, :status
+ attr_accessor :id, :local_id, :global_id, :source, :parent_id,
+ :classification_path_id, :classification_path,
+ :linnean_classification_path, :current_name, :current_name_canonical,
+ :synonyms, :vernacular_names, :rank, :status
def initialize
@id = @parent_id = @rank = @status = nil
@current_name = ''
@current_name_canonical = ''
@@ -20,13 +23,26 @@
@linnean_classification_path = []
end
end
- class SynonymNormalized < Struct.new(:id, :name, :canonical_name, :status, :source, :local_id, :global_id);end
- class VernacularNormalized < Struct.new(:name, :language, :locality, :country_code);end
+ class GnubTaxon < TaxonNormalized
+ attr_accessor :uuid, :uuid_path
+ def initialize
+ super
+ @uuid = nil
+ @uuid_path = []
+ end
+ end
+
+ class SynonymNormalized < Struct.new(:id, :name, :canonical_name,
+ :status, :source, :local_id,
+ :global_id);end
+ class VernacularNormalized < Struct.new(:name, :language, :locality,
+ :country_code);end
+
class ClassificationNormalizer
attr_reader :error_names, :tree, :normalized_data
def initialize(dwc_instance)
@dwc = dwc_instance
@@ -44,11 +60,13 @@
def add_name_string(name_string)
@name_strings[name_string] = 1 unless @name_strings[name_string]
end
def add_vernacular_name_string(name_string)
- @vernacular_name_strings[name_string] = 1 unless @vernacular_name_strings[name_string]
+ unless @vernacular_name_strings[name_string]
+ @vernacular_name_strings[name_string] = 1
+ end
end
def name_strings(opts = {})
opts = { with_hash: false }.merge(opts)
if !!opts[:with_hash]
@@ -66,38 +84,44 @@
@vernacular_name_strings.keys
end
end
def normalize(opts = {})
- opts = { :with_canonical_names => true, :with_extensions => true }.merge(opts)
+ opts = { :with_canonical_names => true,
+ :with_extensions => true }.merge(opts)
@with_canonical_names = !!opts[:with_canonical_names]
- DarwinCore.logger_write(@dwc.object_id, "Started normalization of the classification")
+ DarwinCore.logger_write(@dwc.object_id,
+ 'Started normalization of the classification')
ingest_core
- DarwinCore.logger_write(@dwc.object_id, "Calculating the classification parent/child paths")
- has_parent_id? ? calculate_classification_path : @normalized_data.keys.each { |id| @tree[id] = {} }
- DarwinCore.logger_write(@dwc.object_id, "Ingesting data from extensions")
+ DarwinCore.logger_write(@dwc.object_id,
+ 'Calculating the classification parent/child paths')
+ has_parent_id? ?
+ calculate_classification_path :
+ @normalized_data.keys.each { |id| @tree[id] = {} }
+ DarwinCore.logger_write(@dwc.object_id, 'Ingesting data from extensions')
if !!opts[:with_extensions]
ingest_extensions
end
@normalized_data
end
private
def get_canonical_name(a_scientific_name)
if @with_canonical_names
- canonical_name = @parser.parse(a_scientific_name, :canonical_only => true)
+ canonical_name = @parser.parse(a_scientific_name,
+ :canonical_only => true)
canonical_name.to_s.empty? ? a_scientific_name : canonical_name
else
nil
end
end
def get_fields(element)
- data = element.fields.inject({}) do |res, f|
+ data = element.fields.inject({}) do |res, f|
field = f[:term].split('/')[-1]
- field = field ? field.downcase.to_sym : ''
+ field = field ? field.downcase.to_sym : ''
res[field] = f[:index].to_i
res
end
data[:id] = element.id[:index] if element.id
data
@@ -107,16 +131,20 @@
status && !!status.match(/^syn/)
end
def add_synonym_from_core(taxon_id, row)
@synonyms[row[@core_fields[:id]]] = taxon_id
- taxon = @normalized_data[row[taxon_id]] ? @normalized_data[row[taxon_id]] : @normalized_data[row[taxon_id]] = DarwinCore::TaxonNormalized.new
+ taxon = @normalized_data[row[taxon_id]] ?
+ @normalized_data[row[taxon_id]] :
+ @normalized_data[row[taxon_id]] = DarwinCore::TaxonNormalized.new
synonym = SynonymNormalized.new(
row[@core_fields[:id]],
row[@core_fields[:scientificname]],
row[@core_fields[:canonicalname]],
- @core_fields[:taxonomicstatus] ? row[@core_fields[:taxonomicstatus]] : nil,
+ @core_fields[:taxonomicstatus] ?
+ row[@core_fields[:taxonomicstatus]] :
+ nil,
@core_fields[:source] ? row[@core_fields[:source]] : nil,
@core_fields[:localid] ? row[@core_fields[:localid]] : nil,
@core_fields[:globalid] ? row[@core_fields[:globalid]] : nil,
)
taxon.synonyms << synonym
@@ -127,14 +155,18 @@
def set_scientific_name(row, fields)
row[fields[:scientificname]] = 'N/A' unless row[fields[:scientificname]]
canonical_name = nil
scientific_name = row[fields[:scientificname]].strip
if separate_canonical_and_authorship?(row, fields)
- canonical_name = row[fields[:scientificname]].strip if @with_canonical_names
+ if @with_canonical_names
+ canonical_name = row[fields[:scientificname]].strip
+ end
scientific_name += " #{row[fields[:scientificnameauthorship]].strip}"
else
- canonical_name = get_canonical_name(row[fields[:scientificname]]) if @with_canonical_names
+ if @with_canonical_names
+ canonical_name = get_canonical_name(row[fields[:scientificname]])
+ end
end
fields[:canonicalname] = row.size
row << canonical_name
row[fields[:scientificname]] = scientific_name
end
@@ -147,47 +179,76 @@
!(authorship.empty? || row[fields[:scientificname]].index(authorship))
end
def ingest_core
@normalized_data = {}
- raise DarwinCore::CoreFileError.new("Darwin Core core fields must contain taxon id and scientific name") unless (@core_fields[:id] && @core_fields[:scientificname])
+ has_name_and_id = @core_fields[:id] && @core_fields[:scientificname]
+ raise DarwinCore::CoreFileError.new('Darwin Core core fields must ' +
+ 'contain taxon id and scientific name') unless has_name_and_id
@dwc.core.read do |rows|
rows[1].each do |error|
- @error_names << { :data => error, :error => :reading_or_encoding_error }
+ @error_names << { :data => error,
+ :error => :reading_or_encoding_error }
end
rows[0].each do |r|
set_scientific_name(r, @core_fields)
#core has AcceptedNameUsageId
- if @core_fields[:acceptednameusageid] && r[@core_fields[:acceptednameusageid]] && r[@core_fields[:acceptednameusageid]] != r[@core_fields[:id]]
+ if @core_fields[:acceptednameusageid] &&
+ r[@core_fields[:acceptednameusageid]] &&
+ r[@core_fields[:acceptednameusageid]] != r[@core_fields[:id]]
add_synonym_from_core(@core_fields[:acceptednameusageid], r)
- elsif !@core_fields[:acceptednameusageid] && @core_fields[:taxonomicstatus] && status_synonym?(r[@core_fields[:taxonomicstatus]])
+ elsif !@core_fields[:acceptednameusageid] &&
+ @core_fields[:taxonomicstatus] &&
+ status_synonym?(r[@core_fields[:taxonomicstatus]])
add_synonym_from_core(parent_id, r) if has_parent_id?
else
- taxon = @normalized_data[r[@core_fields[:id]]] ? @normalized_data[r[@core_fields[:id]]] : @normalized_data[r[@core_fields[:id]]] = DarwinCore::TaxonNormalized.new
+ unless @normalized_data[r[@core_fields[:id]]]
+ if gnub_archive?
+ new_taxon = DarwinCore::GnubTaxon.new
+ else
+ new_taxon = DarwinCore::TaxonNormalized.new
+ end
+ @normalized_data[r[@core_fields[:id]]] = new_taxon
+ end
+ taxon = @normalized_data[r[@core_fields[:id]]]
+ if gnub_archive?
+ taxon.uuid = r[@core_fields[:originalnameusageid]]
+ taxon.uuid_path = r[@core_fields[:originalnameusageidpath]].
+ split('|')
+ end
taxon.id = r[@core_fields[:id]]
taxon.current_name = r[@core_fields[:scientificname]]
taxon.current_name_canonical = r[@core_fields[:canonicalname]]
taxon.parent_id = has_parent_id? ? r[parent_id] : nil
taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank]
- taxon.status = r[@core_fields[:taxonomicstatus]] if @core_fields[:taxonomicstatus]
+ if @core_fields[:taxonomicstatus]
+ taxon.status = r[@core_fields[:taxonomicstatus]]
+ end
taxon.source = r[@core_fields[:source]] if @core_fields[:source]
taxon.local_id = r[@core_fields[:localid]] if @core_fields[:localid]
- taxon.global_id = r[@core_fields[:globalid]] if @core_fields[:globalid]
- taxon.linnean_classification_path = get_linnean_classification_path(r, taxon)
+ if @core_fields[:globalid]
+ taxon.global_id = r[@core_fields[:globalid]]
+ end
+ taxon.linnean_classification_path =
+ get_linnean_classification_path(r, taxon)
add_name_string(taxon.current_name)
- add_name_string(taxon.current_name_canonical) if taxon.current_name_canonical && !taxon.current_name_canonical.empty?
+ has_canonical = taxon.current_name_canonical &&
+ !taxon.current_name_canonical.empty?
+ add_name_string(taxon.current_name_canonical) if has_canonical
end
end
end
end
def has_parent_id?
- @has_parent_id ||= @core_fields.has_key?(:highertaxonid) || @core_fields.has_key?(:parentnameusageid)
+ @has_parent_id ||= @core_fields.has_key?(:highertaxonid) ||
+ @core_fields.has_key?(:parentnameusageid)
end
def parent_id
- parent_id_field = @core_fields[:highertaxonid] || @core_fields[:parentnameusageid]
+ parent_id_field = @core_fields[:highertaxonid] ||
+ @core_fields[:parentnameusageid]
end
def calculate_classification_path
@paths_num = 0
@normalized_data.each do |taxon_id, taxon|
@@ -198,73 +259,97 @@
end
def get_classification_path(taxon)
return if !taxon.classification_path_id.empty?
@paths_num += 1
- DarwinCore.logger_write(@dwc.object_id, "Calculated %s paths" % @paths_num) if @paths_num % 10000 == 0
+ if @paths_num % 10000 == 0
+ DarwinCore.logger_write(@dwc.object_id,
+ "Calculated %s paths" % @paths_num)
+ end
current_node = {taxon.id => {}}
if DarwinCore.nil_field?(taxon.parent_id)
- taxon.classification_path << taxon.current_name_canonical if @with_canonical_names
+ if @with_canonical_names
+ taxon.classification_path << taxon.current_name_canonical
+ end
taxon.classification_path_id << taxon.id
@tree.merge!(current_node)
else
parent_cp = parent_cpid = nil
if @normalized_data[taxon.parent_id]
- parent_cp = @normalized_data[taxon.parent_id].classification_path if @with_canonical_names
- parent_cpid = @normalized_data[taxon.parent_id].classification_path_id
+ if @with_canonical_names
+ parent_cp = @normalized_data[taxon.parent_id].classification_path
+ end
+ parent_cpid = @normalized_data[taxon.parent_id].
+ classification_path_id
else
current_parent = @normalized_data[@synonyms[taxon.parent_id]]
if current_parent
- error = "WARNING: The parent of the taxon \'#{taxon.current_name}\' is deprecated"
- @error_names << {:data => taxon, :error => :deprecated_parent, :current_parent => current_parent }
+ error = 'WARNING: The parent of the taxon ' +
+ "\'#{taxon.current_name}\' is deprecated"
+ @error_names << {:data => taxon,
+ :error => :deprecated_parent,
+ :current_parent => current_parent }
- parent_cp = current_parent.classification_path if @with_canonical_names
+ if @with_canonical_names
+ parent_cp = current_parent.classification_path
+ end
parent_cpid = current_parent.classification_path_id
else
- error = "WARNING: The parent of the taxon \'#{taxon.current_name}\' not found"
- @error_names << {:data => taxon, :error => :deprecated_parent, :current_parent => nil}
+ error = 'WARNING: The parent of the taxon ' +
+ "\'#{taxon.current_name}\' not found"
+ @error_names << {:data => taxon,
+ :error => :deprecated_parent, :current_parent => nil}
end
end
return 'error' unless parent_cpid
if parent_cpid.empty?
res = 'error'
begin
res = get_classification_path(@normalized_data[taxon.parent_id])
rescue SystemStackError
- @error_names << {:data => taxon, :error => :too_deep_hierarchy, :current_parent => nil}
+ @error_names << {:data => taxon,
+ :error => :too_deep_hierarchy, :current_parent => nil}
end
return res if res == 'error'
if @with_canonical_names
- taxon.classification_path += @normalized_data[taxon.parent_id].classification_path + [taxon.current_name_canonical]
+ taxon.classification_path += @normalized_data[taxon.parent_id].
+ classification_path + [taxon.current_name_canonical]
end
- taxon.classification_path_id += @normalized_data[taxon.parent_id].classification_path_id + [taxon.id]
- parent_node = @normalized_data[taxon.parent_id].classification_path_id.inject(@tree) {|node, id| node[id]}
+ taxon.classification_path_id += @normalized_data[taxon.parent_id].
+ classification_path_id + [taxon.id]
+ parent_node = @normalized_data[taxon.parent_id].
+ classification_path_id.inject(@tree) {|node, id| node[id]}
parent_node.merge!(current_node)
else
- taxon.classification_path += parent_cp + [taxon.current_name_canonical] if @with_canonical_names
+ taxon.classification_path += parent_cp +
+ [taxon.current_name_canonical] if @with_canonical_names
taxon.classification_path_id += parent_cpid + [taxon.id]
- parent_node = @normalized_data[taxon.parent_id].classification_path_id.inject(@tree) {|node, id| node[id]}
+ parent_node = @normalized_data[taxon.parent_id].
+ classification_path_id.inject(@tree) {|node, id| node[id]}
begin
parent_node.merge!(current_node)
rescue NoMethodError => e
- DarwinCore.logger_write(@dwc.object_id, "Error '%s' taxon %s" % [e.message, taxon.id])
+ DarwinCore.logger_write(@dwc.object_id,
+ "Error '%s' taxon %s" % [e.message, taxon.id])
return 'error'
end
end
end
end
def ingest_extensions
@extensions.each do |e|
ext, fields = *e
- ingest_synonyms(e) if (File.split(e[0].file_path).last.match(/synonym/i) && fields.keys.include?(:scientificname))
+ ingest_synonyms(e) if (File.split(e[0].file_path).
+ last.match(/synonym/i) &&
+ fields.keys.include?(:scientificname))
ingest_vernaculars(e) if fields.keys.include? :vernacularname
end
end
def ingest_synonyms(extension)
- DarwinCore.logger_write(@dwc.object_id, "Ingesting synonyms extension")
+ DarwinCore.logger_write(@dwc.object_id, 'Ingesting synonyms extension')
ext, fields = *extension
ext.read do |rows|
rows[0].each do |r|
set_scientific_name(r, fields)
synonym = SynonymNormalized.new(
@@ -279,18 +364,20 @@
if @normalized_data[r[fields[:id]]]
@normalized_data[r[fields[:id]]].synonyms << synonym
add_name_string(synonym.name)
add_name_string(synonym.canonical_name)
else
- @error_names << { :taxon => synonym, :error => :synonym_of_unknown_taxa }
+ @error_names << { :taxon => synonym,
+ :error => :synonym_of_unknown_taxa }
end
end
end
end
def ingest_vernaculars(extension)
- DarwinCore.logger_write(@dwc.object_id, "Ingesting vernacular names extension")
+ DarwinCore.logger_write(@dwc.object_id,
+ 'Ingesting vernacular names extension')
ext, fields = *extension
ext.read do |rows|
rows[0].each do |r|
language = nil
@@ -311,22 +398,27 @@
country_code)
if @normalized_data[r[fields[:id]]]
@normalized_data[r[fields[:id]]].vernacular_names << vernacular
add_vernacular_name_string(vernacular.name)
else
- @error_names << { :vernacular_name => vernacular, :error => :vernacular_of_unknown_taxa }
+ @error_names << { :vernacular_name => vernacular,
+ :error => :vernacular_of_unknown_taxa }
end
end
end
end
-
+
#Collect linnean classification path only on species level
def get_linnean_classification_path(row, taxon)
res = []
- [:kingdom, :phylum, :class, :order, :family, :genus, :subgenus].each do |clade|
- res << [row[@core_fields[clade]], clade] if @core_fields[clade]
+ [:kingdom, :phylum, :class,
+ :order, :family, :genus, :subgenus].each do |clade|
+ res << [row[@core_fields[clade]], clade] if @core_fields[clade]
end
res
end
+ def gnub_archive?
+ @core_fields[:originalnameusageidpath]
+ end
end
end