require "relaton_bib"

module RelatonNist
  # Builds bibliographic items from hit data. A hit either points at a
  # pre-built YAML document (when it carries a :path) or carries a JSON
  # record (:json) that is converted field by field.
  class Scrapper
    class << self
      DOMAIN = "https://csrc.nist.gov".freeze

      # rubocop:disable Metrics/AbcSize, Metrics/MethodLength

      # Parse page. Dispatches on the presence of :path in the hit data.
      # @param hit_data [Hash] hit data; reads :path, and otherwise the keys
      #   used by {#parse_json}
      # @return [RelatonNist::NistBibliographicItem] presumably - the result of
      #   NistBibliographicItem.from_hash or NistBibliographicItem.new
      def parse_page(hit_data)
        hit_data[:path] ? fetch_gh(hit_data) : parse_json(hit_data)
      end

      # Download the YAML document located at HitCollection::GHNISTDATA +
      # hit_data[:path], stamp it with today's date and build an item from it.
      # @param hit_data [Hash] must contain :path
      # @return [Object] whatever NistBibliographicItem.from_hash returns
      def fetch_gh(hit_data)
        yaml = OpenURI.open_uri "#{HitCollection::GHNISTDATA}#{hit_data[:path]}"
        hash = YAML.safe_load yaml
        hash["fetched"] = Date.today.to_s
        NistBibliographicItem.from_hash hash
      end

      # Build an item from the JSON record carried by the hit.
      # Type and doctype are always set to "standard".
      # @param hit_data [Hash] reads :json, :code, :title, :release_date
      # @return [RelatonNist::NistBibliographicItem]
      def parse_json(hit_data)
        item_data = from_json hit_data
        item_data[:fetched] = Date.today.to_s
        item_data[:type] = "standard"
        item_data[:title] = fetch_titles(hit_data)
        item_data[:doctype] = "standard"
        NistBibliographicItem.new(**item_data)
      end

      private

      # Collect the constructor arguments that are derived from the JSON record.
      # @param hit_data [Hash] reads :json, :code, :release_date
      # @return [Hash] keyword arguments for NistBibliographicItem.new
      def from_json(hit_data)
        json = hit_data[:json]
        {
          link: fetch_link(json),
          docid: fetch_docid(hit_data),
          date: fetch_dates(json, hit_data[:release_date]),
          contributor: fetch_contributors(json),
          edition: fetch_edition(json),
          language: [json["language"]],
          script: [json["script"]],
          docstatus: fetch_status(json),
          copyright: fetch_copyright(json["published-date"]),
          relation: fetch_relations_json(json),
          place: ["Gaithersburg, MD"],
          keyword: fetch_keywords(json),
          commentperiod: fetch_commentperiod_json(json),
        }
      end
      # rubocop:enable Metrics/AbcSize, Metrics/MethodLength

      # Fetch docid. The primary NIST identifier comes from hit[:code]; a DOI
      # identifier (the part after the last "/") is added when the JSON has one.
      # @param hit [Hash] reads :code and :json
      # @return [Array<RelatonBib::DocumentIdentifier>]
      def fetch_docid(hit)
        ids = [
          RelatonBib::DocumentIdentifier.new(id: hit[:code], type: "NIST", primary: true),
        ]
        doi = hit[:json]["doi"]&.split("/")&.last
        ids << RelatonBib::DocumentIdentifier.new(id: doi, type: "DOI") if doi
        ids
      end

      # Fetch status. An iteration of "initial" is mapped to 1; the iteration
      # is always passed on as a String.
      # @param doc [Hash]
      # @return [RelatonNist::DocumentStatus]
      def fetch_status(doc)
        stage = doc["status"]
        subst = doc["substage"]
        iter = doc["iteration"] == "initial" ? 1 : doc["iteration"]
        RelatonNist::DocumentStatus.new stage: stage, substage: subst, iteration: iter.to_s
      end

      # Fetch titles. A single English, plain-text title taken from the hit.
      # @param hit_data [Hash] reads :title
      # @return [Array<Hash>]
      def fetch_titles(hit_data)
        [{ content: hit_data[:title], language: "en", script: "Latn", format: "text/plain" }]
      end

      # Fetch dates. "published" is always emitted from release_date; the
      # "updated", "obsoleted" and "issued" entries are emitted only when the
      # corresponding value parses to a date, so no entry carries a blank date.
      # @param doc [Hash]
      # @param release_date [Date]
      # @return [Array<Hash>]
      def fetch_dates(doc, release_date) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
        dates = [{ type: "published", on: release_date.to_s }]

        issued = RelatonBib.parse_date doc["issued-date"]
        updated = RelatonBib.parse_date doc["updated-date"]
        dates << { type: "updated", on: updated.to_s } if updated
        obsoleted = RelatonBib.parse_date doc["obsoleted-date"]
        dates << { type: "obsoleted", on: obsoleted.to_s } if obsoleted
        # Guarded like its siblings: an absent issued date used to yield on: "".
        dates << { type: "issued", on: issued.to_s } if issued
        dates
      end

      # Fetch contributors: authors first, then editors.
      # @param doc [Hash]
      # @return [Array<RelatonBib::ContributionInfo>]
      def fetch_contributors(doc)
        lang = doc["language"]
        script = doc["script"]
        contributors_json(doc["authors"], "author", lang, script) +
          contributors_json(doc["editors"], "editor", lang, script)
      end

      # Convert JSON contributor records. A record with a surname becomes a
      # person (affiliated to its organization, if any); a record with only an
      # affiliation becomes that organization; anything else is dropped.
      # @param doc [Array<Hash>, nil] contributor records; nil yields []
      # @param role [String]
      # @param lang [String]
      # @param script [String]
      # @return [Array<RelatonBib::ContributionInfo>]
      def contributors_json(doc, role, lang = "en", script = "Latn") # rubocop:disable Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/MethodLength,Metrics/PerceivedComplexity
        # A record may lack "authors" or "editors" altogether.
        return [] unless doc

        doc.map do |contr|
          if contr["affiliation"]
            if contr["affiliation"]["acronym"]
              abbrev = RelatonBib::LocalizedString.new(contr["affiliation"]["acronym"])
            end
            org = RelatonBib::Organization.new(
              name: contr["affiliation"]["name"], abbreviation: abbrev,
            )
          end
          if contr["surname"]
            affiliation = []
            affiliation << RelatonBib::Affiliation.new(organization: org) if org
            entity = RelatonBib::Person.new(
              name: full_name(contr, lang, script), affiliation: affiliation,
            )
          elsif org
            entity = org
          end
          if entity
            RelatonBib::ContributionInfo.new entity: entity, role: [type: role]
          end
        end.compact
      end

      # Build a full name from the "surname", "givenName", "suffix", "title"
      # and "fullName" keys of a contributor record.
      # @param name [Hash]
      # @param lang [String]
      # @param script [String]
      # @return [RelatonBib::FullName]
      def full_name(name, lang, script)
        RelatonBib::FullName.new(
          surname: RelatonBib::LocalizedString.new(name["surname"], lang, script),
          forename: name_parts(name["givenName"], lang, script),
          addition: name_parts(name["suffix"], lang, script),
          prefix: name_parts(name["title"], lang, script),
          completename: RelatonBib::LocalizedString.new(name["fullName"], lang, script),
        )
      end

      # Wrap an optional name part in a one-element array.
      # @param part [String, NilClass]
      # @param lang [String]
      # @param script [String]
      # @return [Array<RelatonBib::LocalizedString>] empty when part is nil
      def name_parts(part, lang, script)
        return [] unless part

        [RelatonBib::LocalizedString.new(part, lang, script)]
      end

      # Fetch edition.
      # @param doc [Hash]
      # @return [String, NilClass] "Revision <edition>", or nil without edition
      def fetch_edition(doc)
        rev = doc["edition"]
        "Revision #{rev}" if rev
      end

      # Fetch copyright. The owner is always NIST; the start year is the first
      # run of four digits found in the given string.
      # @param doc [String, NilClass] the caller passes json["published-date"]
      # @return [Array<Hash>]
      def fetch_copyright(doc)
        name = "National Institute of Standards and Technology"
        url = "www.nist.gov"
        from = doc&.match(/\d{4}/)&.to_s
        [{ owner: [{ name: name, abbreviation: "NIST", url: url }], from: from }]
      end

      # Fetch links: a "src" link from "uri" and a "doi" link from "doi",
      # each only when present.
      # @param doc [Hash]
      # @return [Array<Hash>]
      def fetch_link(doc)
        links = []
        links << { type: "src", content: doc["uri"] } if doc["uri"]
        if doc["doi"]
          links << { type: "doi", content: "https://doi.org/#{doc['doi']}" }
        end
        links
      end

      # Fetch relations from the "supersedes" and "superseded-by" lists.
      # A missing list is treated as empty.
      # @param doc [Hash]
      # @return [Array<RelatonNist::DocumentRelation>]
      def fetch_relations_json(doc)
        supersedes = (doc["supersedes"] || []).map do |r|
          doc_relation "supersedes", r["docidentifier"], r["uri"]
        end
        superseded_by = (doc["superseded-by"] || []).map do |r|
          doc_relation "updates", r["docidentifier"], r["uri"]
        end
        supersedes + superseded_by
      end

      # Build a relation to a document known only by reference and URI.
      # A "supersedes" relation is emitted with type "obsoletes" and a
      # "supersedes" description; any other type is passed through unchanged
      # with no description.
      # @param type [String]
      # @param ref [String]
      # @param uri [String]
      # @param lang [String]
      # @param script [String]
      # @return [RelatonNist::DocumentRelation]
      def doc_relation(type, ref, uri, lang = "en", script = "Latn") # rubocop:disable Metrics/MethodLength
        if type == "supersedes"
          descr = RelatonBib::FormattedString.new(content: "supersedes", language: lang, script: script)
          t = "obsoletes"
        else
          t = type
        end
        DocumentRelation.new(
          type: t,
          description: descr,
          bibitem: RelatonBib::BibliographicItem.new(
            formattedref: RelatonBib::FormattedRef.new(
              content: ref, language: lang, script: script, format: "text/plain",
            ),
            link: [RelatonBib::TypedUri.new(type: "src", content: uri)],
          ),
        )
      end

      # Fetch keywords. String entries are used as is; any other entry is
      # asked for its #text. A missing list is treated as empty.
      # @param doc [Hash]
      # @return [Array<String>]
      def fetch_keywords(doc)
        (doc["keywords"] || []).map { |kw| kw.is_a?(String) ? kw : kw.text }
      end

      # Fetch comment period.
      # @param json [Hash]
      # @return [RelatonNist::CommentPeriod, NilClass] nil without "comment-from"
      def fetch_commentperiod_json(json)
        return unless json["comment-from"]

        CommentPeriod.new from: json["comment-from"], to: json["comment-to"]
      end
    end
  end
end