lib/dwc-archive/classification_normalizer.rb in dwc-archive-0.7.3 vs lib/dwc-archive/classification_normalizer.rb in dwc-archive-0.7.4
- old
+ new
@@ -42,27 +42,31 @@
# Returns the keys of @name_strings as a new Array.
def name_strings
  @name_strings.each_key.to_a
end
- def normalize
+ def normalize(opts = {:with_canoical_names => true})
+ @with_canonical_names = opts[:with_canonical_names] != nil ? opts[:with_canonical_names] : true
DarwinCore.logger_write(@dwc.object_id, "Started normalization of the classification")
- @normalized_data = {}
ingest_core
DarwinCore.logger_write(@dwc.object_id, "Calculating the classification parent/child paths")
- calculate_classification_path
+ has_parent_id? ? calculate_classification_path : @normalized_data.keys.each { |id| @tree[id] = {} }
DarwinCore.logger_write(@dwc.object_id, "Ingesting data from extensions")
ingest_extensions
@normalized_data
end
private
# Returns the canonical form of a_scientific_name, or nil when
# @with_canonical_names is false (new side of the diff).
# The argument itself is re-tagged as UTF-8: String#force_encoding changes
# its receiver in place.
def get_canonical_name(a_scientific_name)
a_scientific_name.force_encoding('utf-8')
- canonical_name = @parser.parse(a_scientific_name, :canonical_only => true)
- canonical_name.to_s.empty? ? a_scientific_name : canonical_name
+ if @with_canonical_names
+ canonical_name = @parser.parse(a_scientific_name, :canonical_only => true)
# Fall back to the unparsed name when the parser result is empty.
+ canonical_name.to_s.empty? ? a_scientific_name : canonical_name
+ else
+ nil
+ end
end
def get_fields(element)
data = element.fields.inject({}) { |res, f| res[f[:term].split('/')[-1].downcase.to_sym] = f[:index].to_i; res }
data[:id] = element.id[:index]
@@ -86,17 +90,17 @@
add_name_string(synonym.canonical_name)
end
# Fills in the scientific and canonical name columns of a row, in place.
#
# row    - Array of field values; a canonical-name value is appended to it.
# fields - Hash of field symbol => column index; :canonicalname is set to the
#          index of the appended column.
#
# On the new side of the diff the canonical name stays nil when
# @with_canonical_names is false.
def set_scientific_name(row, fields)
# A missing scientific name is replaced by the placeholder 'N/A'.
row[fields[:scientificname]] = 'N/A' unless row[fields[:scientificname]]
- canonical_name = ''
+ canonical_name = nil
scientific_name = row[fields[:scientificname]].strip.force_encoding('utf-8')
if separate_canonical_and_authorship?(row, fields)
# The scientificname column is taken as the canonical name; the authorship
# column is appended to form the full scientific name.
- canonical_name = row[fields[:scientificname]].strip.force_encoding('utf-8')
+ canonical_name = row[fields[:scientificname]].strip.force_encoding('utf-8') if @with_canonical_names
scientific_name += " #{row[fields[:scientificnameauthorship]].strip.force_encoding('utf-8')}"
else
- canonical_name = get_canonical_name(row[fields[:scientificname]])
+ canonical_name = get_canonical_name(row[fields[:scientificname]]) if @with_canonical_names
end
# The canonical name goes into a new trailing column; row.size, read before
# the append, is that column's index.
fields[:canonicalname] = row.size
row << canonical_name
row[fields[:scientificname]] = scientific_name
end
@@ -110,81 +114,94 @@
end
# Reads the core file of @dwc and fills @normalized_data, keyed by taxon id.
# Each row is handled either as a synonym (add_synonym_from_core) or as a
# taxon (DarwinCore::TaxonNormalized).
#
# Raises RuntimeError when @core_fields lacks :id or :scientificname.
def ingest_core
+ @normalized_data = {}
raise RuntimeError, "Darwin Core core fields must contain taxon id and scientific name" unless (@core_fields[:id] && @core_fields[:scientificname])
@dwc.core.read do |rows|
# rows[1] entries are recorded as read/encoding errors; rows[0] holds the data rows.
+ rows[1].each do |error|
+ @error_names << { :data => error, :error => :reading_or_encoding_error }
+ end
rows[0].each do |r|
set_scientific_name(r, @core_fields)
# Core has an acceptedNameUsageId field: a row whose accepted id differs from its own id is a synonym.
if @core_fields[:acceptednameusageid] && r[@core_fields[:acceptednameusageid]] && r[@core_fields[:acceptednameusageid]] != r[@core_fields[:id]]
add_synonym_from_core(@core_fields[:acceptednameusageid], r)
# No acceptedNameUsageId field: rely on taxonomicStatus to detect synonyms.
elsif !@core_fields[:acceptednameusageid] && @core_fields[:taxonomicstatus] && status_synonym?(r[@core_fields[:taxonomicstatus]])
- add_synonym_from_core(parent_id, r)
# New side: when the core has no parent id field, such a row is neither
# added as a synonym nor as a taxon.
+ add_synonym_from_core(parent_id, r) if has_parent_id?
else
# Reuse the taxon already stored under this id, or create and store a new one.
taxon = @normalized_data[r[@core_fields[:id]]] ? @normalized_data[r[@core_fields[:id]]] : @normalized_data[r[@core_fields[:id]]] = DarwinCore::TaxonNormalized.new
taxon.id = r[@core_fields[:id]]
taxon.current_name = r[@core_fields[:scientificname]]
taxon.current_name_canonical = r[@core_fields[:canonicalname]]
- taxon.parent_id = r[parent_id]
+ taxon.parent_id = has_parent_id? ? r[parent_id] : nil
taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank]
taxon.status = r[@core_fields[:taxonomicstatus]] if @core_fields[:taxonomicstatus]
add_name_string(taxon.current_name)
- add_name_string(taxon.current_name_canonical)
# New side: nil or empty canonical names are not registered as name strings.
+ add_name_string(taxon.current_name_canonical) if taxon.current_name_canonical && !taxon.current_name_canonical.empty?
end
end
end
end
# True when @core_fields has a :highertaxonid or a :parentnameusageid key.
# NOTE(review): `||=` only caches a truthy result; when the answer is false
# the key lookups run again on every call.
+ def has_parent_id?
+ @has_parent_id ||= @core_fields.has_key?(:highertaxonid) || @core_fields.has_key?(:parentnameusageid)
+ end
+
# Returns the @core_fields entry for the parent-taxon field: the value stored
# under :highertaxonid when it is truthy, otherwise the value stored under
# :parentnameusageid (nil when neither is mapped). Callers use the result as
# a row index.
def parent_id
  @core_fields[:highertaxonid] || @core_fields[:parentnameusageid]
end
# Computes the classification path of every taxon in @normalized_data that
# does not have one yet, via get_classification_path.
# Resets the @paths_num counter first.
def calculate_classification_path
@paths_num = 0
@normalized_data.each do |taxon_id, taxon|
# New side: an already-filled classification_path_id marks a taxon as done
# (the old side checked classification_path instead).
- next if !taxon.classification_path.empty?
+ next if !taxon.classification_path_id.empty?
res = get_classification_path(taxon)
# An 'error' result is ignored here: this `next` is the last statement of the block.
next if res == 'error'
end
end
def get_classification_path(taxon)
- return if !taxon.classification_path.empty?
+ return if !taxon.classification_path_id.empty?
@paths_num += 1
DarwinCore.logger_write(@dwc.object_id, "Calculated %s paths" % @paths_num) if @paths_num % 10000 == 0
current_node = {taxon.id => {}}
if DarwinCore.nil_field?(taxon.parent_id)
- taxon.classification_path << taxon.current_name_canonical
+ taxon.classification_path << taxon.current_name_canonical if @with_canonical_names
taxon.classification_path_id << taxon.id
@tree.merge!(current_node)
else
- parent_cp = nil
+ parent_cp = parent_cpid = nil
if @normalized_data[taxon.parent_id]
- parent_cp = @normalized_data[taxon.parent_id].classification_path
+ parent_cp = @normalized_data[taxon.parent_id].classification_path if @with_canonical_names
+ parent_cpid = @normalized_data[taxon.parent_id].classification_path_id
else
current_parent = @normalized_data[@synonyms[taxon.parent_id]]
if current_parent
error = "WARNING: The parent of the taxon \'#{taxon.current_name}\' is deprecated"
- @error_names << {:name => taxon, :error => :deprecated_parent, :current_parent => current_parent }
- parent_cp = current_parent.classification_path
+ @error_names << {:data => taxon, :error => :deprecated_parent, :current_parent => current_parent }
+
+ parent_cp = current_parent.classification_path if @with_canonical_names
+ parent_cpid = current_parent.classification_path_id
else
error = "WARNING: The parent of the taxon \'#{taxon.current_name}\' not found"
- @error_names << {:name => taxon, :error => :deprecated_parent, :current_parent => nil}
+ @error_names << {:data => taxon, :error => :deprecated_parent, :current_parent => nil}
end
end
- return 'error' unless parent_cp
- if parent_cp.empty?
+ return 'error' unless parent_cpid
+ if parent_cpid.empty?
res = get_classification_path(@normalized_data[taxon.parent_id])
return res if res == 'error'
- taxon.classification_path += @normalized_data[taxon.parent_id].classification_path + [taxon.current_name_canonical]
+ if @with_canonical_names
+ taxon.classification_path += @normalized_data[taxon.parent_id].classification_path + [taxon.current_name_canonical]
+ end
taxon.classification_path_id += @normalized_data[taxon.parent_id].classification_path_id + [taxon.id]
parent_node = @normalized_data[taxon.parent_id].classification_path_id.inject(@tree) {|node, id| node[id]}
parent_node.merge!(current_node)
else
- taxon.classification_path += parent_cp + [taxon.current_name_canonical]
- taxon.classification_path_id += @normalized_data[taxon.parent_id].classification_path_id + [taxon.id]
+ taxon.classification_path += parent_cp + [taxon.current_name_canonical] if @with_canonical_names
+ taxon.classification_path_id += parent_cpid + [taxon.id]
parent_node = @normalized_data[taxon.parent_id].classification_path_id.inject(@tree) {|node, id| node[id]}
begin
parent_node.merge!(current_node)
rescue NoMethodError => e
DarwinCore.logger_write(@dwc.object_id, "Error '%s' taxon %s" % [e.message, taxon.id])