lib/dwc-archive/classification_normalizer.rb in dwc-archive-0.5.1 vs lib/dwc-archive/classification_normalizer.rb in dwc-archive-0.5.2
- old
+ new
@@ -20,21 +20,18 @@
# Struct holding one normalized synonym: name, canonical_name and status.
class SynonymNormalized < Struct.new(:name, :canonical_name, :status);end
# Struct holding one normalized vernacular name together with its language.
class VernacularNormalized < Struct.new(:name, :language);end
class ClassificationNormalizer
# Readers for the two collections the constructor initialises empty
# (@error_names and @tree). The removed (-) line is the old verbose accessor.
- attr_accessor :verbose
attr_reader :error_names, :tree
# Sets up a normalizer around a Darwin Core archive object.
# dwc_instance is asked for `core` and `extensions` below.
# Old side (-): also took a `verbose` flag and kept @verbose / @verbose_count.
# New side (+): takes only the archive object.
- def initialize(dwc_instance, verbose = false)
+ def initialize(dwc_instance)
@dwc = dwc_instance
@core_fields = get_fields(@dwc.core)
# one [extension, fields] pair per extension
@extensions = @dwc.extensions.map { |e| [e, get_fields(e)] }
# result hash; ingest_core keys it by taxon id
@res = {}
# NOTE(review): the meaning of the (1,2) arguments is not visible here - confirm against ParsleyStore
@parser = ParsleyStore.new(1,2)
- @verbose = verbose
- @verbose_count = 10000
@name_strings = {}
@error_names = []
@tree = {}
end
@@ -45,13 +42,16 @@
# Returns the keys of the @name_strings hash as an array.
# NOTE(review): presumably filled by add_name_string, whose body is not shown here - confirm.
def name_strings
@name_strings.keys
end
# Runs the whole normalization and returns the result hash (@res).
# Steps, in order: reset @res, ingest_core, calculate_classification_path,
# ingest_extensions. The added (+) lines write a log message, tagged with
# @dwc.object_id, before each stage.
def normalize
+ DarwinCore.logger_write(@dwc.object_id, "Started normalization of the classification")
@res = {}
ingest_core
+ DarwinCore.logger_write(@dwc.object_id, "Calculating the classification parent/child paths")
calculate_classification_path
+ DarwinCore.logger_write(@dwc.object_id, "Ingesting data from extensions")
ingest_extensions
@res
end
private
@@ -92,30 +92,27 @@
row[fields[:scientificname]] = scientific_name
end
# Reads the core data and fills @res with one TaxonNormalized per row that is
# not treated as a synonym; synonym rows go to add_synonym_from_core instead.
# Raises RuntimeError when the core has no taxon id or no scientific name field.
# Old side (-): takes @dwc.core.read[0] in one go and prints progress when @verbose is set.
# New side (+): passes a block to @dwc.core.read and walks rows[0] of what it yields.
# NOTE(review): presumably read yields the data in batches - confirm against the reader.
def ingest_core
raise RuntimeError, "Darwin Core core fields must contain taxon id and scientific name" unless (@core_fields[:id] && @core_fields[:scientificname])
- puts "Reading core information" if @verbose
- rows = @dwc.core.read[0]
- puts "Ingesting information from the core" if @verbose
- rows.each_with_index do |r, i|
- count = i + 1
- set_scientific_name(r, @core_fields)
- puts "Ingesting %s'th record" % count if @verbose and count % @verbose_count == 0
- #core has AcceptedNameUsageId
- if @core_fields[:acceptednameusageid] && r[@core_fields[:acceptednameusageid]] && r[@core_fields[:acceptednameusageid]] != r[@core_fields[:id]]
- add_synonym_from_core(@core_fields[:acceptednameusageid], r)
- elsif !@core_fields[:acceptednameusageid] && status_synonym?(r[@core_fields[:taxonomicstatus]])
- add_synonym_from_core(parent_id, r)
- else
- taxon = @res[r[@core_fields[:id]]] ? @res[r[@core_fields[:id]]] : @res[r[@core_fields[:id]]] = DarwinCore::TaxonNormalized.new
- taxon.id = r[@core_fields[:id]]
- taxon.current_name = r[@core_fields[:scientificname]]
- taxon.current_name_canonical = r[@core_fields[:canonicalname]]
- taxon.parent_id = r[parent_id]
- taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank]
- taxon.status = r[@core_fields[:taxonomicstatus]] if @core_fields[:taxonomicstatus]
+ @dwc.core.read do |rows|
+ rows[0].each do |r|
+ set_scientific_name(r, @core_fields)
+ #core has AcceptedNameUsageId
# first branch: the row carries an accepted-name id that differs from its own id
+ if @core_fields[:acceptednameusageid] && r[@core_fields[:acceptednameusageid]] && r[@core_fields[:acceptednameusageid]] != r[@core_fields[:id]]
+ add_synonym_from_core(@core_fields[:acceptednameusageid], r)
# second branch: no accepted-name field at all, so status_synonym? decides from the taxonomic status
+ elsif !@core_fields[:acceptednameusageid] && status_synonym?(r[@core_fields[:taxonomicstatus]])
+ add_synonym_from_core(parent_id, r)
+ else
# reuse the entry already stored under this id, otherwise create and store a new one
+ taxon = @res[r[@core_fields[:id]]] ? @res[r[@core_fields[:id]]] : @res[r[@core_fields[:id]]] = DarwinCore::TaxonNormalized.new
+ taxon.id = r[@core_fields[:id]]
+ taxon.current_name = r[@core_fields[:scientificname]]
+ taxon.current_name_canonical = r[@core_fields[:canonicalname]]
+ taxon.parent_id = r[parent_id]
# rank and status are only set when the core declares those fields
+ taxon.rank = r[@core_fields[:taxonrank]] if @core_fields[:taxonrank]
+ taxon.status = r[@core_fields[:taxonomicstatus]] if @core_fields[:taxonomicstatus]
+ end
end
end
end
def parent_id
@@ -170,35 +167,33 @@
ingest_vernaculars(e) if fields.keys.include? :vernacularname
end
end
# Adds one SynonymNormalized per extension row to the synonyms list of the
# @res entry stored under the row's id.
# extension is an [extension, fields] pair; it is destructured into ext and fields.
# The synonym's status is nil when the extension has no taxonomicstatus field.
# Old side (-): iterates ext.read[0] and prints progress when @verbose is set.
# New side (+): logs once via DarwinCore.logger_write and iterates rows[0] inside a read block.
# NOTE(review): @res[...] is used without a nil check; a row whose id has no
# entry in @res would fail on `.synonyms` - confirm the input guarantees a match.
def ingest_synonyms(extension)
- puts "Ingesting synonyms extension" if @verbose
+ DarwinCore.logger_write(@dwc.object_id, "Ingesting synonyms extension")
ext, fields = *extension
- ext.read[0].each_with_index do |r, i|
- count = i + 1
- set_scientific_name(r, fields)
- puts "Ingesting %s'th record" % count if @verbose && count % @verbose_count == 0
- @res[r[fields[:id]]].synonyms << SynonymNormalized.new(
- r[fields[:scientificname]],
- r[fields[:canonicalname]],
- fields[:taxonomicstatus] ? r[fields[:taxonomicstatus]] : nil)
+ ext.read do |rows|
+ rows[0].each do |r|
+ set_scientific_name(r, fields)
+ @res[r[fields[:id]]].synonyms << SynonymNormalized.new(
+ r[fields[:scientificname]],
+ r[fields[:canonicalname]],
+ fields[:taxonomicstatus] ? r[fields[:taxonomicstatus]] : nil)
+ end
end
end
# Adds one VernacularNormalized per extension row to the vernacular_names list
# of the @res entry stored under the row's id, then passes the vernacular name
# to add_name_string.
# extension is an [extension, fields] pair; it is destructured into ext and fields.
# The language is nil when the extension has no languagecode field.
# Old side (-): iterates ext.read[0] and prints progress when @verbose is set.
# New side (+): logs once via DarwinCore.logger_write and iterates rows[0] inside a read block.
# NOTE(review): @res[...] is used without a nil check; a row whose id has no
# entry in @res would fail on `.vernacular_names` - confirm the input guarantees a match.
def ingest_vernaculars(extension)
- puts "Ingesting vernacular names" if @verbose
+ DarwinCore.logger_write(@dwc.object_id, "Ingesting vernacular names extension")
ext, fields = *extension
- ext.read[0].each_with_index do |r, i|
- count = i + 1
- puts "Ingesting %s'th record" % count if @verbose && count % @verbose_count == 0
- @res[r[fields[:id]]].vernacular_names << VernacularNormalized.new(
- r[fields[:vernacularname]],
- fields[:languagecode] ? r[fields[:languagecode]] : nil)
- add_name_string(r[fields[:vernacularname]])
+ ext.read do |rows|
+ rows[0].each do |r|
+ @res[r[fields[:id]]].vernacular_names << VernacularNormalized.new(
+ r[fields[:vernacularname]],
+ fields[:languagecode] ? r[fields[:languagecode]] : nil)
+ add_name_string(r[fields[:vernacularname]])
+ end
end
end
end
end
-
-