lib/dwc_archive/classification_normalizer.rb in dwc-archive-1.1.0 vs lib/dwc_archive/classification_normalizer.rb in dwc-archive-1.1.1
- old
+ new
@@ -1,20 +1,20 @@
-# encoding: utf-8
+# frozen_string_literal: true
+
class DarwinCore
# Returns tree representation of Darwin Core file with vernacular and
# and synonyms attached to the taxon nodes
class ClassificationNormalizer
attr_reader :error_names, :tree, :normalized_data, :dwc
- alias_method :darwin_core, :dwc
+ alias darwin_core dwc
def initialize(dwc_instance)
@dwc = dwc_instance
@core_fields = find_fields(@dwc.core)
@extensions = @dwc.extensions.map { |e| [e, find_fields(e)] }
@normalized_data = {}
@synonyms = {}
- @parser = ::Biodiversity::Parser
@name_strings = {}
@vernacular_name_strings = {}
@error_names = []
@tree = {}
end
@@ -23,10 +23,11 @@
@name_strings[name_string] = 1 unless @name_strings[name_string]
end
def add_vernacular_name_string(name_string)
return if @vernacular_name_strings[name_string]
+
@vernacular_name_strings[name_string] = 1
end
def name_strings(opts = {})
process_strings(@name_strings, opts)
@@ -68,13 +69,13 @@
end
end
def get_canonical_name(a_scientific_name)
return nil unless @with_canonical_names
- canonical_name = nil
- parsed = @parser.parse(a_scientific_name)
- canonical_name = parsed[:canonicalName][:simple] if parsed[:parsed]
+
+ canonical_name = Biodiversity::Parser.parse(a_scientific_name).
+ dig(:canonicalName, :simple)
canonical_name.to_s.empty? ? a_scientific_name : canonical_name
end
def find_fields(element)
data = element.fields.each_with_object({}) do |f, h|
@@ -85,19 +86,17 @@
data[:id] = element.id[:index] if element.id
data
end
def status_synonym?(status)
- status && status.match(/^syn/)
+ status&.match(/^syn/)
end
def add_synonym_from_core(taxon_id, row)
cf = @core_fields
@synonyms[row[cf[:id]]] = taxon_id
- unless @normalized_data[row[taxon_id]]
- @normalized_data[row[taxon_id]] = DarwinCore::TaxonNormalized.new
- end
+ @normalized_data[row[taxon_id]] = DarwinCore::TaxonNormalized.new unless @normalized_data[row[taxon_id]]
taxon = @normalized_data[row[taxon_id]]
synonym = SynonymNormalized.new(
row[cf[:id]],
row[cf[:scientificname]],
@@ -105,48 +104,43 @@
cf[:taxonomicstatus] ? row[cf[:taxonomicstatus]] : nil,
cf[:source] ? row[cf[:source]] : nil,
cf[:localid] ? row[cf[:localid]] : nil,
cf[:globalid] ? row[cf[:globalid]] : nil
)
- taxon.synonyms << synonym
+ taxon.synonyms << synonym
add_name_string(synonym.name)
add_name_string(synonym.canonical_name)
end
def set_scientific_name(row, fields)
row[fields[:scientificname]] = "N/A" unless row[fields[:scientificname]]
canonical_name = nil
scientific_name = row[fields[:scientificname]].strip
if separate_canonical_and_authorship?(row, fields)
- if @with_canonical_names
- canonical_name = row[fields[:scientificname]].strip
- end
+ canonical_name = row[fields[:scientificname]].strip if @with_canonical_names
scientific_name += " #{row[fields[:scientificnameauthorship]].strip}"
else
- if @with_canonical_names
- canonical_name = get_canonical_name(row[fields[:scientificname]])
- end
+ canonical_name = get_canonical_name(row[fields[:scientificname]]) if @with_canonical_names
end
fields[:canonicalname] = row.size
row << canonical_name
row[fields[:scientificname]] = scientific_name
end
def separate_canonical_and_authorship?(row, fields)
authorship = ""
- if fields[:scientificnameauthorship]
- authorship = row[fields[:scientificnameauthorship]].to_s.strip
- end
+ authorship = row[fields[:scientificnameauthorship]].to_s.strip if fields[:scientificnameauthorship]
!(authorship.empty? || row[fields[:scientificname]].index(authorship))
end
def ingest_core
@normalized_data = {}
has_name_and_id = @core_fields[:id] && @core_fields[:scientificname]
- fail(DarwinCore::CoreFileError,
- "Darwin Core core fields must contain taxon id and scientific name"
- ) unless has_name_and_id
+ unless has_name_and_id
+ raise(DarwinCore::CoreFileError,
+ "Darwin Core core fields must contain taxon id and scientific name")
+ end
@dwc.core.read do |rows|
rows[1].each do |error|
@error_names << { data: error,
error: :reading_or_encoding_error }
end
@@ -161,36 +155,32 @@
@core_fields[:taxonomicstatus] &&
status_synonym?(r[@core_fields[:taxonomicstatus]])
add_synonym_from_core(parent_id, r) if parent_id?
else
unless @normalized_data[r[@core_fields[:id]]]
- if gnub_archive?
- new_taxon = DarwinCore::GnubTaxon.new
- else
- new_taxon = DarwinCore::TaxonNormalized.new
- end
+ new_taxon = if gnub_archive?
+ DarwinCore::GnubTaxon.new
+ else
+ DarwinCore::TaxonNormalized.new
+ end
@normalized_data[r[@core_fields[:id]]] = new_taxon
end
taxon = @normalized_data[r[@core_fields[:id]]]
if gnub_archive?
taxon.uuid = r[@core_fields[:originalnameusageid]]
taxon.uuid_path = r[@core_fields[:originalnameusageidpath]].
- split("|")
+ split("|")
end
taxon.id = r[@core_fields[:id]]
taxon.current_name = r[@core_fields[:scientificname]]
taxon.current_name_canonical = r[@core_fields[:canonicalname]]
taxon.parent_id = parent_id? ? r[parent_id] : nil
taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank]
- if @core_fields[:taxonomicstatus]
- taxon.status = r[@core_fields[:taxonomicstatus]]
- end
+ taxon.status = r[@core_fields[:taxonomicstatus]] if @core_fields[:taxonomicstatus]
taxon.source = r[@core_fields[:source]] if @core_fields[:source]
taxon.local_id = r[@core_fields[:localid]] if @core_fields[:localid]
- if @core_fields[:globalid]
- taxon.global_id = r[@core_fields[:globalid]]
- end
+ taxon.global_id = r[@core_fields[:globalid]] if @core_fields[:globalid]
taxon.linnean_classification_path =
get_linnean_classification_path(r, taxon)
add_name_string(taxon.current_name)
has_canonical = taxon.current_name_canonical &&
!taxon.current_name_canonical.empty?
@@ -211,65 +201,63 @@
def calculate_classification_path
@paths_num = 0
@normalized_data.each do |_taxon_id, taxon|
next unless taxon.classification_path_id.empty?
+
res = get_classification_path(taxon)
next if res == "error"
end
end
def get_classification_path(taxon)
return unless taxon.classification_path_id.empty?
+
@paths_num += 1
if @paths_num % 10_000 == 0
DarwinCore.logger_write(@dwc.object_id,
"Calculated #{@paths_num} paths")
end
current_node = { taxon.id => {} }
if DarwinCore.nil_field?(taxon.parent_id)
- if @with_canonical_names
- taxon.classification_path << taxon.current_name_canonical
- end
+ taxon.classification_path << taxon.current_name_canonical if @with_canonical_names
taxon.classification_path_id << taxon.id
@tree.merge!(current_node)
else
parent_cp = parent_cpid = nil
if @normalized_data[taxon.parent_id]
- if @with_canonical_names
- parent_cp = @normalized_data[taxon.parent_id].classification_path
- end
+ parent_cp = @normalized_data[taxon.parent_id].classification_path if @with_canonical_names
parent_cpid = @normalized_data[taxon.parent_id].
classification_path_id
else
current_parent = @normalized_data[@synonyms[taxon.parent_id]]
if current_parent
@error_names << { data: taxon,
error: :deprecated_parent,
current_parent: current_parent }
- if @with_canonical_names
- parent_cp = current_parent.classification_path
- end
+ parent_cp = current_parent.classification_path if @with_canonical_names
parent_cpid = current_parent.classification_path_id
else
@error_names << { data: taxon,
error: :deprecated_parent,
current_parent: nil }
end
end
return "error" unless parent_cpid
+
if parent_cpid.empty?
res = "error"
begin
res = get_classification_path(@normalized_data[taxon.parent_id])
rescue SystemStackError
@error_names << { data: taxon,
error: :too_deep_hierarchy,
current_parent: nil }
end
return res if res == "error"
+
if @with_canonical_names
taxon.classification_path += @normalized_data[taxon.parent_id].
classification_path +
[taxon.current_name_canonical]
end
@@ -293,11 +281,11 @@
begin
parent_node.merge!(current_node)
rescue NoMethodError => e
DarwinCore.logger_write(@dwc.object_id,
"Error '#{e.message}' taxon #{taxon.id}")
- return "error"
+ "error"
end
end
end
end
@@ -379,11 +367,11 @@
(fields[:languagecode] && row[fields[:languagecode]]) || nil
end
# Collect linnean classification path only on species level
def get_linnean_classification_path(row, _taxon)
- [:kingdom, :phylum, :class, :order, :family, :genus,
- :subgenus].each_with_object([]) do |clade, res|
+ %i[kingdom phylum class order family genus
+ subgenus].each_with_object([]) do |clade, res|
res << [row[@core_fields[clade]], clade] if @core_fields[clade]
end
end
def gnub_archive?