lib/dwc-archive/classification_normalizer.rb in dwc-archive-0.4.2 vs lib/dwc-archive/classification_normalizer.rb in dwc-archive-0.4.3
- old
+ new
@@ -1,10 +1,10 @@
# encoding: utf-8
-require 'biodiversity'
+require 'parsley-store'
class DarwinCore
-
+
class TaxonNormalized
attr_accessor :id, :parent_id, :classification_path, :current_name, :current_name_canonical, :synonyms, :vernacular_names, :rank, :status
def initialize
@id = @parent_id = @classification_path = @current_name = @current_name_canonical = @rank = @status = nil
@@ -16,16 +16,19 @@
class SynonymNormalized < Struct.new(:name, :canonical_name, :status);end
class VernacularNormalized < Struct.new(:name, :language);end
class ClassificationNormalizer
- def initialize(dwc_instance)
+
+ def initialize(dwc_instance, verbose = false)
@dwc = dwc_instance
@core = get_fields(@dwc.core)
@extensions = @dwc.extensions.map { |e| [e, get_fields(e)] }
@res = {}
- @parser = ScientificNameParser.new
+ @parser = ParsleyStore.new(1,2)
+ @verbose = verbose
+ @verbose_count = 1000
end
def normalize
injest_core
calculate_classification_path
@@ -40,11 +43,11 @@
a_scientific_name.force_encoding('utf-8')
end
begin
parsed_name = @parser.parse(a_scientific_name)[:scientificName]
rescue
- @parser = ScientificNameParser.new
+ @parser = ParsleyStore.new(1,2)
parsed_name = @parser.parse(a_scientific_name)[:scientificName]
end
parsed_name[:parsed] ? parsed_name[:canonical] : a_scientific_name
end
@@ -61,16 +64,21 @@
def add_synonym_from_core(taxon_id, row)
taxon = @res[row[taxon_id]] ? @res[row[taxon_id]] : @res[row[taxon_id]] = DarwinCore::TaxonNormalized.new
taxon.synonyms << SynonymNormalized.new(
row[@core[:scientificname]],
canonical_name(row[@core[:scientificname]]),
- row[@core[:taxonomicstatus]])
+ @core[:taxonomicstatus] ? row[@core[:taxonomicstatus]] : nil)
end
def injest_core
raise RuntimeError, "Darwin Core core fields must contain taxon id and scientific name" unless (@core[:id] && @core[:scientificname])
- @dwc.core.read[0].each do |r|
+ puts "Reading core information" if @verbose
+ rows = @dwc.core.read[0]
+ puts "Injesting information from the core" if @verbose
+ rows.each_with_index do |r, i|
+ count = i + 1
+ puts "Injesting %s'th record" % count if @verbose and count % @verbose_count == 0
#core has AcceptedNameUsageId
if @core[:acceptednameusageid] && r[@core[:acceptednameusageid]] && r[@core[:acceptednameusageid]] != r[@core[:id]]
add_synonym_from_core(@core[:acceptednameusageid], r)
elsif !@core[:acceptednameusageid] && status_synonym?(r[@core[:taxonomicstatus]])
add_synonym_from_core(parent_id, r)
@@ -100,11 +108,11 @@
def get_classification_path(taxon)
return if taxon.classification_path
if DarwinCore.nil_field?(taxon.parent_id)
taxon.classification_path = [taxon.current_name_canonical]
else
- parent_cp = @res[taxon.parent_id].classification_path
+ parent_cp = @res[taxon.parent_id].classification_path
if parent_cp
taxon.classification_path = parent_cp + [taxon.current_name_canonical]
else
get_classification_path(@res[taxon.parent_id])
taxon.classification_path = @res[taxon.parent_id].classification_path + [taxon.current_name_canonical]
@@ -119,25 +127,32 @@
injest_vernaculars(e) if fields.keys.include? :vernacularname
end
end
def injest_synonyms(extension)
+ puts "Injesting synonyms extension" if @verbose
ext, fields = *extension
- ext.read[0].each do |r|
+ ext.read[0].each_with_index do |r, i|
+ count = i + 1
+ puts "Injesting %s'th record" % count if @verbose && count % @verbose_count == 0
@res[r[fields[:id]]].synonyms << SynonymNormalized.new(
r[fields[:scientificname]],
canonical_name(r[fields[:scientificname]]),
fields[:taxonomicstatus] ? r[fields[:taxonomicstatus]] : nil)
end
end
def injest_vernaculars(extension)
+ puts "Injesting vernacular names" if @verbose
ext, fields = *extension
- ext.read[0].each do |r|
+ ext.read[0].each_with_index do |r, i|
+ count = i + 1
+ puts "Injesting %s'th record" % count if @verbose && count % @verbose_count == 0
@res[r[fields[:id]]].vernacular_names << VernacularNormalized.new(
r[fields[:vernacularname]],
fields[:languagecode] ? r[fields[:languagecode]] : nil)
end
end
end
end
+