lib/relaton_nist/data_fetcher.rb in relaton-nist-1.9.4 vs lib/relaton_nist/data_fetcher.rb in relaton-nist-1.9.6

- old
+ new

@@ -9,32 +9,49 @@ "isVersionOf" => "editionOf", "hasTranslation" => "hasTranslation", "isTranslationOf" => "translatedFrom", "hasPreprint" => "hasReprint", "isSupplementTo" => "complements", + "isPartOf" => "partOf", + "hasPart" => "hasPart", }.freeze URL = "https://raw.githubusercontent.com/usnistgov/NIST-Tech-Pubs/nist-pages/xml/allrecords.xml" def initialize(output, format) @output = output @format = format @ext = format.sub(/^bib/, "") end - def parse_docid(doc) - doi = doc.at("doi_data/doi").text - id = doc.at("publisher_item/item_number", "publisher_item/identifier").text.sub(%r{^/}, "") - case doi - when "10.6028/NBS.CIRC.12e2revjune" then id.sub!("13e", "12e") - when "10.6028/NBS.CIRC.36e2" then id.sub!("46e", "36e") - when "10.6028/NBS.HB.67suppJune1967" then id.sub!("1965", "1967") - when "10.6028/NBS.HB.105-1r1990" then id.sub!("105-1-1990", "105-1r1990") - when "10.6028/NIST.HB.150-10-1995" then id.sub!(/150-10$/, "150-10-1995") - end - [{ type: "NIST", id: id }, { type: "DOI", id: doi }] + def parse_docid(doc) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength + # case doi + # when "10.6028/NBS.CIRC.12e2revjune" then doi.sub!("13e", "12e") + # when "10.6028/NBS.CIRC.36e2" then doi.sub!("46e", "36e") + # when "10.6028/NBS.HB.67suppJune1967" then doi.sub!("1965", "1967") + # when "10.6028/NBS.HB.105-1r1990" then doi.sub!("105-1-1990", "105-1r1990") + # when "10.6028/NIST.HB.150-10-1995" then doi.sub!(/150-10$/, "150-10-1995") + # end + # anchor = doi.split("/")[1..-1].join "/" + [ + { type: "NIST", id: pub_id(doc) }, + { type: "DOI", id: doi(doc) }, + { type: "NIST", id: anchor(doc), scope: "anchor" }, + ] end + def pub_id(doc) + anchor(doc).gsub(".", " ") + end + + def doi(doc) + doc.at("doi_data/doi").text + end + + def anchor(doc) + doi(doc).split("/")[1..-1].join "/" + end + # @param doc [Nokogiri::XML::Element] # @return [Array<RelatonBib::DocumentIdentifier>] def fetch_docid(doc) parse_docid(doc).map do |id| 
RelatonBib::DocumentIdentifier.new(type: id[:type], id: id[:id]) @@ -45,11 +62,11 @@ # @return [RelatonBib::TypedTitleStringCollection, Array] def fetch_title(doc) t = doc.xpath("titles/title|titles/subtitle") return [] unless t.any? - RelatonBib::TypedTitleString.from_string t.map(&:text).join(" "), "en", "Latn" + RelatonBib::TypedTitleString.from_string t.map(&:text).join, "en", "Latn" end # @param doc [Nokogiri::XML::Element] # @return [Array<RelatonBib::BibliographicDate>] def fetch_date(doc) @@ -74,16 +91,15 @@ # @param doc [Nokogiri::XML::Element] # @return [Array<Hash>] def fetch_relation(doc) ns = "http://www.crossref.org/relations.xsd" doc.xpath("./ns:program/ns:related_item", ns: ns).map do |rel| - doi = rel.at_xpath("ns:intra_work_relation|ns:inter_work_relation", ns: ns) - # ref = doi_to_id doi.text - # ref, = parse_docid doc - fref = RelatonBib::FormattedRef.new content: doi.text + rdoi = rel.at_xpath("ns:intra_work_relation|ns:inter_work_relation", ns: ns) + fref = RelatonBib::FormattedRef.new content: rdoi.text bibitem = RelatonBib::BibliographicItem.new formattedref: fref - type = RELATION_TYPES[doi["relationship-type"]] + type = RELATION_TYPES[rdoi["relationship-type"]] + warn "Relation type #{rdoi['relationship-type']} not found" unless type { type: type, bibitem: bibitem } end end # @param doc [Nokogiri::XML::Element] @@ -121,26 +137,46 @@ RelatonBib::PersonIdentifier.new "orcid", id.text end fullname = RelatonBib::FullName.new( surname: surname, forename: forename, initial: initial, identifier: ident, ) - person = RelatonBib::Person.new name: fullname + person = RelatonBib::Person.new name: fullname, affiliation: affiliation(doc) { entity: person, role: [{ type: p["contributor_role"] }] } end contribs + doc.xpath("publisher").map do |p| abbr = p.at("../institution/institution_acronym")&.text - org = RelatonBib::Organization.new(name: p.at("publisher_name").text, abbreviation: abbr) + place = p.at("./publisher_place") + cont = [] + if place + 
city, state = place.text.split(", ") + cont << RelatonBib::Address.new(street: [], city: city, state: state, country: "US") + end + org = RelatonBib::Organization.new( + name: p.at("publisher_name").text, abbreviation: abbr, contact: cont, + ) { entity: org, role: [{ type: "publisher" }] } end end + def affiliation(doc) + doc.xpath("./institution/institution_department").map do |id| + org = RelatonBib::Organization.new name: id.text + RelatonBib::Affiliation.new organization: org + end + end + # @param doc [Nokogiri::XML::Element] # @return [Array<String>] def fetch_place(doc) doc.xpath("institution/institution_place").map(&:text) end + def fetch_series(doc) + title = RelatonBib::TypedTitleString.new(content: "NIST") + [RelatonBib::Series.new(title: title, number: pub_id(doc))] + end + # # Save document # # @param bib [RelatonNist::NistBibliographicItem] # @@ -172,18 +208,19 @@ item = RelatonNist::NistBibliographicItem.new( type: "standard", docid: fetch_docid(doc), title: fetch_title(doc), link: fetch_link(doc), abstract: fetch_abstract(doc), date: fetch_date(doc), edition: fetch_edition(doc), contributor: fetch_contributor(doc), relation: fetch_relation(doc), - place: fetch_place(doc), + place: fetch_place(doc), series: fetch_series(doc), language: [doc["language"]], script: ["Latn"], doctype: "standard" ) write_file item rescue StandardError => e warn "Document: #{doc.at('doi').text}" warn e.message - raise e + warn e.backtrace[0..5].join("\n") + # raise e end # # Fetch all the documents from dataset # @@ -200,9 +237,10 @@ t2 = Time.now puts "Stopped at: #{t2}" puts "Done in: #{(t2 - t1).round} sec." rescue StandardError => e warn e.message + warn e.backtrace[0..5].join("\n") end # # Fetch all the documents from dataset #