lib/dwc-archive/classification_normalizer.rb in dwc-archive-0.4.13 vs lib/dwc-archive/classification_normalizer.rb in dwc-archive-0.4.14
- old
+ new
@@ -25,11 +25,11 @@
attr_accessor :verbose
attr_reader :error_names, :tree
def initialize(dwc_instance, verbose = false)
@dwc = dwc_instance
- @core = get_fields(@dwc.core)
+ @core_fields = get_fields(@dwc.core)
@extensions = @dwc.extensions.map { |e| [e, get_fields(e)] }
@res = {}
@parser = ParsleyStore.new(1,2)
@verbose = verbose
@verbose_count = 10000
@@ -54,11 +54,11 @@
@res
end
private
- def canonical_name(a_scientific_name)
+ def get_canonical_name(a_scientific_name)
if R19
a_scientific_name.force_encoding('utf-8')
end
begin
parsed_name = @parser.parse(a_scientific_name)[:scientificName]
@@ -82,42 +82,51 @@
end
# Appends a SynonymNormalized to the taxon stored in @res under row[taxon_id],
# first creating an empty DarwinCore::TaxonNormalized under that key if none exists.
#
# taxon_id - column index into row; the value found there is the @res key.
# row      - one core record, indexed by the positions held in the core field map.
#
# Reading the diff: "-" lines are the 0.4.13 code, "+" lines the 0.4.14 code.
# In 0.4.14 the canonical name is no longer parsed here; it is read from the
# :canonicalname column. In ingest_core that column is appended by
# set_scientific_name(r, @core_fields) before this method is called.
# The third argument (status) is nil when the core has no taxonomicstatus field.
def add_synonym_from_core(taxon_id, row)
taxon = @res[row[taxon_id]] ? @res[row[taxon_id]] : @res[row[taxon_id]] = DarwinCore::TaxonNormalized.new
taxon.synonyms << SynonymNormalized.new(
- row[@core[:scientificname]],
- canonical_name(row[@core[:scientificname]]),
- @core[:taxonomicstatus] ? row[@core[:taxonomicstatus]] : nil)
+ row[@core_fields[:scientificname]],
+ row[@core_fields[:canonicalname]],
+ @core_fields[:taxonomicstatus] ? row[@core_fields[:taxonomicstatus]] : nil)
end
# New in 0.4.14 (every line of this method is a "+" line).
# Normalizes the name columns of one record, mutating BOTH arguments in place:
#   * appends a canonical name to the end of row and stores that position in
#     fields[:canonicalname];
#   * overwrites row[fields[:scientificname]] with the stripped name, followed by
#     a space and the stripped authorship when the authorship value is non-blank.
#
# row    - record supporting size, << and indexed read/write
#          (presumably an Array of Strings -- TODO confirm against the reader).
# fields - map of field symbol => column index; :canonicalname is (re)assigned on every call.
#
# Returns the new scientific name string (the value of the final assignment).
+ def set_scientific_name(row, fields)
# With a scientificnameauthorship field present, the scientificname value is used
# as the canonical name as-is (not parsed, not stripped); otherwise it is passed
# through get_canonical_name.
# NOTE(review): this relies on the name column carrying no authorship whenever a
# separate authorship column exists -- confirm that holds for the input archives.
+ canonical_name = fields[:scientificnameauthorship] ? row[fields[:scientificname]] : get_canonical_name(row[fields[:scientificname]])
# row.size is the index the element appended on the next line will occupy.
+ fields[:canonicalname] = row.size
+ row << canonical_name
# NOTE(review): .strip is called on row[fields[:scientificname]] with no nil check;
# a record without a scientific name would raise NoMethodError -- confirm names are always present.
+ scientific_name = (fields[:scientificnameauthorship] && row[fields[:scientificnameauthorship]].to_s.strip != '') ? row[fields[:scientificname]].strip + ' ' + row[fields[:scientificname]].strip + ' ' + row[fields[:scientificnameauthorship]].strip : row[fields[:scientificname]].strip
+ row[fields[:scientificname]] = scientific_name
+ end
+
# Reads the core rows and fills @res (keyed by taxon id) with
# DarwinCore::TaxonNormalized entries. Rows that match one of the two synonym
# conditions below are attached to another taxon via add_synonym_from_core
# instead of becoming a taxon of their own.
#
# Raises RuntimeError when the core field map lacks :id or :scientificname.
#
# Reading the diff: "-" lines are the 0.4.13 code, "+" lines the 0.4.14 code.
# 0.4.14 renames @core to @core_fields and takes canonical names from the column
# appended by set_scientific_name instead of parsing the name at each use.
def ingest_core
- raise RuntimeError, "Darwin Core core fields must contain taxon id and scientific name" unless (@core[:id] && @core[:scientificname])
+ raise RuntimeError, "Darwin Core core fields must contain taxon id and scientific name" unless (@core_fields[:id] && @core_fields[:scientificname])
puts "Reading core information" if @verbose
# Only element 0 of what read returns is used (presumably the data rows -- TODO confirm).
rows = @dwc.core.read[0]
puts "Ingesting information from the core" if @verbose
rows.each_with_index do |r, i|
count = i + 1
# Runs before any branch below: appends the canonical name to r and rewrites
# r's scientific name, so the branches can read both columns directly.
+ set_scientific_name(r, @core_fields)
# Progress line once every @verbose_count records.
puts "Ingesting %s'th record" % count if @verbose and count % @verbose_count == 0
# Synonym case 1: the row carries an acceptednameusageid value that differs from
# its own id; it is attached to the taxon keyed by that accepted id.
#core has AcceptedNameUsageId
- if @core[:acceptednameusageid] && r[@core[:acceptednameusageid]] && r[@core[:acceptednameusageid]] != r[@core[:id]]
- add_synonym_from_core(@core[:acceptednameusageid], r)
- elsif !@core[:acceptednameusageid] && status_synonym?(r[@core[:taxonomicstatus]])
+ if @core_fields[:acceptednameusageid] && r[@core_fields[:acceptednameusageid]] && r[@core_fields[:acceptednameusageid]] != r[@core_fields[:id]]
+ add_synonym_from_core(@core_fields[:acceptednameusageid], r)
# Synonym case 2: the core has no acceptednameusageid field and status_synonym?
# (defined elsewhere) accepts the row's taxonomicstatus value. The synonym is
# attached to the taxon keyed by the value in the row's parent-id column.
# NOTE(review): the taxonomicstatus index may be nil here, in which case r is
# indexed with nil -- confirm rows tolerate that or that the field is always present.
+ elsif !@core_fields[:acceptednameusageid] && status_synonym?(r[@core_fields[:taxonomicstatus]])
add_synonym_from_core(parent_id, r)
else
# Otherwise the row is a taxon in its own right: reuse the @res entry for its id
# (one may already exist if a synonym for it was seen earlier) or create it, then fill it in.
- taxon = @res[r[@core[:id]]] ? @res[r[@core[:id]]] : @res[r[@core[:id]]] = DarwinCore::TaxonNormalized.new
- taxon.id = r[@core[:id]]
- taxon.current_name = r[@core[:scientificname]]
- taxon.current_name_canonical = canonical_name(r[@core[:scientificname]])
+ taxon = @res[r[@core_fields[:id]]] ? @res[r[@core_fields[:id]]] : @res[r[@core_fields[:id]]] = DarwinCore::TaxonNormalized.new
+ taxon.id = r[@core_fields[:id]]
+ taxon.current_name = r[@core_fields[:scientificname]]
+ taxon.current_name_canonical = r[@core_fields[:canonicalname]]
taxon.parent_id = r[parent_id]
# rank and status are assigned only when the corresponding field exists in the core.
- taxon.rank = r[@core[:taxonrank]] if @core[:taxonrank]
- taxon.status = r[@core[:taxonomicstatus]] if @core[:taxonomicstatus]
+ taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank]
+ taxon.status = r[@core_fields[:taxonomicstatus]] if @core_fields[:taxonomicstatus]
end
end
end
# Returns the core column index used as the parent reference: the :highertaxonid
# entry of the field map if present, otherwise the :parentnameusageid entry
# (nil when the map has neither).
# The local parent_id_field is never read; the value of the assignment is what
# the method returns. ("-" is the 0.4.13 line, "+" the 0.4.14 line.)
def parent_id
- parent_id_field = @core[:highertaxonid] || @core[:parentnameusageid]
+ parent_id_field = @core_fields[:highertaxonid] || @core_fields[:parentnameusageid]
end
def calculate_classification_path
@res.each do |taxon_id, taxon|
next if !taxon.classification_path.empty?
@@ -170,13 +179,14 @@
# Adds one SynonymNormalized per row of a synonyms extension to the taxon in
# @res whose key is the value of the row's :id column.
#
# extension - two-element pair [ext, fields]: ext responds to read, fields maps
#             field symbols to column indexes (presumably an element of
#             @extensions as built in initialize -- confirm against the caller).
#
# Reading the diff: "-" is the 0.4.13 line, "+" lines are 0.4.14. In 0.4.14 each
# row is first passed through set_scientific_name, which appends the canonical
# name to the row and records its index in fields[:canonicalname]; the synonym
# then reads that column instead of parsing the name here.
# The third SynonymNormalized argument is nil when the extension has no
# taxonomicstatus field.
#
# NOTE(review): @res[r[fields[:id]]] is used without a nil check; a row whose id
# has no entry in @res would raise NoMethodError on .synonyms -- confirm every
# extension row refers to a taxon already ingested from the core.
def ingest_synonyms(extension)
puts "Ingesting synonyms extension" if @verbose
ext, fields = *extension
# Only element 0 of what read returns is iterated (presumably the data rows -- TODO confirm).
ext.read[0].each_with_index do |r, i|
count = i + 1
+ set_scientific_name(r, fields)
# Progress line once every @verbose_count records.
puts "Ingesting %s'th record" % count if @verbose && count % @verbose_count == 0
@res[r[fields[:id]]].synonyms << SynonymNormalized.new(
r[fields[:scientificname]],
- canonical_name(r[fields[:scientificname]]),
+ r[fields[:canonicalname]],
fields[:taxonomicstatus] ? r[fields[:taxonomicstatus]] : nil)
end
end
def ingest_vernaculars(extension)