lib/relaton_nist/data_fetcher.rb in relaton-nist-1.16.2 vs lib/relaton_nist/data_fetcher.rb in relaton-nist-1.16.3
- old
+ new
@@ -2,24 +2,11 @@
require "yaml"
module RelatonNist
class DataFetcher
- RELATION_TYPES = {
- "replaces" => "obsoletes",
- "isVersionOf" => "editionOf",
- "hasTranslation" => "hasTranslation",
- "isTranslationOf" => "translatedFrom",
- "hasPreprint" => "hasReprint",
- "isPreprintOf" => "hasDraft",
- "isSupplementTo" => "complements",
- "isPartOf" => "partOf",
- "hasPart" => "hasPart",
- }.freeze
-
URL = "https://raw.githubusercontent.com/usnistgov/NIST-Tech-Pubs/nist-pages/xml/allrecords.xml"
- NS = "http://www.crossref.org/relations.xsd"
def initialize(output, format)
@output = output
@format = format
@ext = format.sub(/^bib/, "")
@@ -28,341 +15,72 @@
def index
@index ||= Relaton::Index.find_or_create :nist, file: "index-v1.yaml"
end
- def parse_docid(doc) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
- [
- { type: "NIST", id: pub_id(doc), primary: true },
- { type: "DOI", id: fetch_doi(doc) },
- # { type: "NIST", id: anchor(doc), scope: "anchor" },
- ]
- end
-
- #
- # Parse document's ID from XML
- #
- # @param [Nokogiri::XML::Element] doc XML element
- #
- # @return [String] document's ID
- #
- def pub_id(doc)
- # anchor(doc).gsub(".", " ")
- fetch_doi(doc).split("/")[1..].join("/").gsub(".", " ").sub(/^nist\sir/, "NIST IR")
- end
-
- def fetch_doi(doc) # rubocop:disable Metrics/CyclomaticComplexity
- id = doc.at("doi_data/doi").text
- case id
- when "10.6028/NBS.CIRC.e2e" then "10.6028/NBS.CIRC.2e2"
- when "10.6028/NBS.CIRC.sup" then "10.6028/NBS.CIRC.24e7sup"
- when "10.6028/NBS.CIRC.supJun1925-Jun1926" then "10.6028/NBS.CIRC.24e7sup2"
- when "10.6028/NBS.CIRC.supJun1925-Jun1927" then "10.6028/NBS.CIRC.24e7sup3"
- when "10.6028/NBS.CIRC.24supJuly1922" then "10.6028/NBS.CIRC.24e6sup"
- when "10.6028/NBS.CIRC.24supJan1924" then "10.6028/NBS.CIRC.24e6sup2"
- else id
- end
- end
-
- # def anchor(doc)
- # fetch_doi(doc).split("/")[1..-1].join "/"
- # end
-
- # @param doc [Nokogiri::XML::Element]
- # @return [Array<RelatonBib::DocumentIdentifier>]
- def fetch_docid(doc)
- parse_docid(doc).map do |id|
- RelatonBib::DocumentIdentifier.new(**id)
- end
- end
-
- # @param doc [Nokogiri::XML::Element]
- # @return [RelatonBib::TypedTitleStringCollection, Array]
- def fetch_title(doc)
- t = doc.xpath("titles/title|titles/subtitle")
- return [] unless t.any?
-
- # RelatonBib::TypedTitleString.from_string t.map(&:text).join, "en", "Latn"
- [{ content: t.map(&:text).join, language: "en", script: "Latn",
- format: "text/plain" }]
- end
-
- # @param doc [Nokogiri::XML::Element]
- # @return [Array<RelatonBib::BibliographicDate>]
- def fetch_date(doc)
- doc.xpath("publication_date|approval_date").map do |dt|
- on = dt.at("year").text
- if (m = dt.at "month")
- on += "-#{m.text}"
- d = dt.at "day"
- on += "-#{d.text}" if d
- end
- type = dt.name == "publication_date" ? "published" : "confirmed"
- RelatonBib::BibliographicDate.new(type: type, on: on)
- end
- end
-
- # @param doc [Nokogiri::XML::Element]
- # @return [String]
- def fetch_edition(doc)
- doc.at("edition_number")&.text
- end
-
- # @param doc [Nokogiri::XML::Element]
- # @return [Array<Hash>]
- def fetch_relation(doc) # rubocop:disable Metrics/AbcSize
- doc.xpath("./ns:program/ns:related_item", ns: NS).map do |rel|
- rdoi = rel.at_xpath("ns:intra_work_relation|ns:inter_work_relation", ns: NS)
- id = rdoi.text.split("/")[1..].join("/").gsub(".", " ")
- fref = RelatonBib::FormattedRef.new content: id
- docid = RelatonBib::DocumentIdentifier.new(type: "NIST", id: id, primary: true)
- bibitem = RelatonBib::BibliographicItem.new formattedref: fref, docid: [docid]
- type = RELATION_TYPES[rdoi["relationship-type"]]
- warn "Relation type #{rdoi['relationship-type']} not found" unless type
- { type: type, bibitem: bibitem }
- end
- end
-
- def fetch_status(doc)
- s = doc.at("./ns:program/ns:related_item/ns:*[@relationship-type='isPreprintOf']", ns: NS)
- return unless s
-
- RelatonBib::DocumentStatus.new stage: "preprint"
- end
-
- # @param doc [Nokogiri::XML::Element]
- # @return [Array<RelatonBib::TypedUri>]
- def fetch_link(doc)
- pdf = doc.at("doi_data/resource").text
- doi = "https://doi.org/#{fetch_doi(doc)}"
- [{ type: "doi", content: doi }, { type: "pdf", content: pdf }].map do |l|
- RelatonBib::TypedUri.new(**l)
- end
- end
-
- # @param doc [Nokogiri::XML::Element]
- # @return [Array<RelatonBib::FormattedString>]
- def fetch_abstract(doc)
- doc.xpath(
- "jats:abstract/jats:p", "jats" => "http://www.ncbi.nlm.nih.gov/JATS1"
- ).each_with_object([]) do |a, m|
- next if a.text.empty?
-
- m << RelatonBib::FormattedString.new(content: a.text, language: doc["language"], script: "Latn")
- end
- end
-
- # @param doc [Nokogiri::XML::Element]
- # @return [Array<Hash>]
- def fetch_contributor(doc)
- contribs = doc.xpath("contributors/person_name").map do |p|
- person = RelatonBib::Person.new(name: fullname(p, doc),
- affiliation: affiliation(doc))
- { entity: person, role: [{ type: p["contributor_role"] }] }
- end
- contribs + doc.xpath("publisher").map do |p|
- { entity: create_org(p), role: [{ type: "publisher" }] }
- end
- end
-
- #
- # Create full name object from person name element.
- #
- # @param [Nokogiri::XML::Element] person name element
- # @param [Nokogiri::XML::Element] doc document element
- #
- # @return [RelatonBib::FullName] full name object
- #
- def fullname(person, doc)
- forename, initials = forename_initial(person, doc)
- surname = localized_string person.at("surname").text, doc
- ident = person.xpath("ORCID").map do |id|
- RelatonBib::PersonIdentifier.new "orcid", id.text
- end
- RelatonBib::FullName.new(surname: surname, forename: forename,
- initials: initials, identifier: ident)
- end
-
- #
- # Create forename and initials objects from person name element.
- #
- # @param [Nokogiri::XML::Element] person person name element
- # @param [Nokogiri::XML::Element] doc document element
- #
- # @return [Array<Array<RelatonBib::LocalizedString>>] forename and initials
- #
- def forename_initial(person, doc) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
- fnames = []
- fname = person.at("given_name")&.text
- if fname
- if /^(?<inits>(?:\w[.\s]+|[A-Z]{1,2}$)+)$/ =~ fname
- ints = inits.split(/[.\s]*/)
- fnames << forename(doc, fname, ints.shift)
- ints.each { |i| fnames << forename(doc, nil, i) }
- else
- fn = forename(doc, fname)
- fnames << fn if fn
- end
- end
- initials = localized_string inits, doc if not(inits.nil? || inits.empty?)
- [fnames, initials]
- end
-
- #
- # Create forename object
- #
- # @param [Nokogiri::XML::Element] doc document element
- # @param [String, nil] cnt forename content
- # @param [String, nil] init initial content
- #
- # @return [RelatonBib::Forename] forename object
- #
- def forename(doc, cnt, init = nil)
- return if (cnt.nil? || cnt.empty?) && (init.nil? || init.empty?)
-
- RelatonBib::Forename.new(
- content: cnt, language: doc["language"], script: "Latn", initial: init,
- )
- end
-
- #
- # Create localized string
- #
- # @param [String] content content of string
- # @param [Nokogiri::XML::Elemrnt] doc XML element
- #
- # @return [RelatonBib::LocalizedString] localized string
- #
- def localized_string(content, doc)
- RelatonBib::LocalizedString.new content, doc["language"], "Latn"
- end
-
- #
- # Create publisher organization
- #
- # @param [Nokogiri::XML::Element] pub publisher element
- #
- # @return [RelatonBib::Organization] publisher organization
- #
- def create_org(pub)
- name = pub.at("publisher_name").text
- abbr = pub.at("../institution[institution_name[.='#{name}']]/institution_acronym")&.text
- place = pub.at("./publisher_place") ||
- pub.at("../institution[institution_name[.='#{name}']]/institution_place")
- cont = []
- if place
- city, state = place.text.split(", ")
- cont << RelatonBib::Address.new(street: [], city: city, state: state, country: "US")
- end
- RelatonBib::Organization.new name: name, abbreviation: abbr, contact: cont
- end
-
- #
- # Create affiliation organization
- #
- # @param [Nokogiri::XML::Element] doc affiliation element
- #
- # @return [Array<RelatonBib::Affiliation>] affiliation
- #
- def affiliation(doc)
- doc.xpath("./institution/institution_department").map do |id|
- org = RelatonBib::Organization.new name: id.text
- RelatonBib::Affiliation.new organization: org
- end
- end
-
- # @param doc [Nokogiri::XML::Element]
- # @return [Array<String>]
- def fetch_place(doc)
- doc.xpath("institution/institution_place").map(&:text)
- end
-
- #
- # Fetches series
- #
- # @param [Nokogiri::XML::Element] doc document element
- #
- # @return [Array<RelatonBib::Series>] series
- #
- def fetch_series(doc)
- prf, srs, num = pub_id(doc).split
- sname = series[srs] || srs
- title = RelatonBib::TypedTitleString.new(content: "#{prf} #{sname}")
- abbr = RelatonBib::LocalizedString.new srs
- [RelatonBib::Series.new(title: title, abbreviation: abbr, number: num)]
- end
-
def series
@series ||= YAML.load_file File.expand_path("series.yaml", __dir__)
end
#
# Save document
#
# @param bib [RelatonNist::NistBibliographicItem]
#
- def write_file(bib) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
+ def write_file(bib) # rubocop:disable Metrics/AbcSize
id = bib.docidentifier[0].id.gsub(%r{[/\s:.]}, "_").upcase.sub(/^NIST_IR/, "NISTIR")
file = File.join(@output, "#{id}.#{@ext}")
if @files.include? file
warn "File #{file} exists. Docid: #{bib.docidentifier[0].id}"
# warn "Link: #{bib.link.detect { |l| l.type == 'src' }.content}"
else @files << file
end
- output = case @format
- when "yaml" then bib.to_hash.to_yaml
- when "xml" then bib.to_xml bibdata: true
- else bib.send "to_#{@format}"
- end
index.add_or_update bib.docidentifier[0].id, file
- File.write file, output, encoding: "UTF-8"
+ File.write file, output(bib), encoding: "UTF-8"
end
- #
- # Create a document instance an save it.
- #
- # @param doc [Nokogiri::XML::Element]
- #
- # @raise [StandardError]
- #
- def parse_doc(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
- # mtd = doc.at('doi_record/report-paper/report-paper_metadata')
- item = RelatonNist::NistBibliographicItem.new(
- type: "standard", docid: fetch_docid(doc),
- title: fetch_title(doc), link: fetch_link(doc), abstract: fetch_abstract(doc),
- date: fetch_date(doc), edition: fetch_edition(doc),
- contributor: fetch_contributor(doc), relation: fetch_relation(doc),
- docstatus: fetch_status(doc), place: fetch_place(doc), series: fetch_series(doc),
- language: [doc["language"]], script: ["Latn"], doctype: "standard"
- )
- write_file item
- rescue StandardError => e
- warn "Document: #{doc.at('doi').text}"
- warn e.message
- warn e.backtrace[0..5].join("\n")
- # raise e
+ def output(bib)
+ case @format
+ when "yaml" then bib.to_hash.to_yaml
+ when "xml" then bib.to_xml bibdata: true
+ else bib.send "to_#{@format}"
+ end
end
#
# Fetch all the documnts from dataset
#
def fetch # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
t1 = Time.now
puts "Started at: #{t1}"
- docs = Nokogiri::XML OpenURI.open_uri URL
FileUtils.mkdir_p @output
FileUtils.rm Dir[File.join(@output, "*.#{@ext}")]
- docs.xpath("/body/query/doi_record/report-paper/report-paper_metadata")
- .each { |doc| parse_doc doc }
+ fetch_tech_pubs
+ add_static_files
index.save
+
t2 = Time.now
puts "Stopped at: #{t2}"
puts "Done in: #{(t2 - t1).round} sec."
rescue StandardError => e
warn e.message
warn e.backtrace[0..5].join("\n")
+ end
+
+ def fetch_tech_pubs
+ docs = Nokogiri::XML OpenURI.open_uri URL
+ docs.xpath(
+ "/body/query/doi_record/report-paper/report-paper_metadata",
+ ).each { |doc| write_file TechPubsParser.parse(doc, series) }
+ end
+
+ def add_static_files
+ Dir["./static/*.yaml"].each do |file|
+ hash = YAML.load_file file
+ write_file RelatonNist::NistBibliographicItem.from_hash(hash)
+ end
end
#
# Fetch all the documnts from dataset
#