lib/relaton_calconnect/scrapper.rb in relaton-calconnect-1.9.0 vs lib/relaton_calconnect/scrapper.rb in relaton-calconnect-1.9.1

- old
+ new

module RelatonCalconnect
  # Builds bibliographic items from CalConnect index entries ("hits"),
  # downloading the RXL (Relaton XML) document when the hit links to one.
  module Scrapper
    DOMAIN = "https://standards.calconnect.org/".freeze
    # DOMAIN = "http://127.0.0.1:4000/".freeze

    # Scheme and host of DOMAIN; used to complete links that carry no host.
    SCHEME, HOST = DOMAIN.split(%r{:?/?/})

    class << self
      #
      # Build a bibliographic item from one index entry.
      #
      # When the entry has a link of type "rxl", the item is parsed from the
      # downloaded RXL document and the entry's remaining links are added to
      # it. Otherwise the item is built directly from the entry hash.
      #
      # @param hit [Hash] index entry; its "link" value may be an Array,
      #   a single element or nil
      #
      # @return [RelatonCalconnect::CcBibliographicItem]
      #
      def parse_page(hit)
        links = array(hit["link"])
        rxl_link = links.detect { |l| l["type"] == "rxl" }
        bib = if rxl_link
                update_links fetch_bib_xml(rxl_link["content"]), links
              else
                RelatonCalconnect::CcBibliographicItem.from_hash doc_to_hash(hit)
              end
        # Links without a host get the scheme and host of DOMAIN.
        bib.link.each do |l|
          l.content.merge!(scheme: SCHEME, host: HOST) unless l.content.host
        end
        bib
      end

      private

      #
      # Download an RXL document and parse it.
      #
      # If the document contains a `uri` element of type "rxl", the document
      # it points to is downloaded instead, and the `uri` elements of the
      # first document are inserted before its `docidentifier`.
      #
      # @param url [String] path relative to DOMAIN
      #
      # @return [Object] result of RelatonCalconnect::XMLParser.from_xml
      #   (presumably a CcBibliographicItem -- TODO confirm)
      #
      def fetch_bib_xml(url)
        rxl = get_rxl url
        uri_rxl = rxl.at("uri[@type='rxl']")
        if uri_rxl
          uri_xml = rxl.xpath("//uri").to_xml
          rxl = get_rxl uri_rxl.text
          # The linked document may have no docidentifier; skip the insertion
          # in that case instead of raising NoMethodError on nil.
          rxl.at("//docidentifier")&.add_previous_sibling uri_xml
        end
        RelatonCalconnect::XMLParser.from_xml rename_committee_tags(rxl.to_xml)
      end

      #
      # Rename `technical-committee` tags to `committee`.
      #
      # Uses the non-destructive `gsub`: `gsub!` returns nil when nothing is
      # replaced, which would hand nil to the XML parser for documents that
      # have no `technical-committee` element.
      #
      # @param xml [String]
      #
      # @return [String] a new string; identical content when nothing matches
      #
      def rename_committee_tags(xml)
        xml.gsub(%r{(</?)technical-committee(>)}, '\1committee\2')
      end

      # @param path [String] path relative to DOMAIN
      # @return [Nokogiri::XML::Document]
      def get_rxl(path)
        resp = Faraday.get DOMAIN + path
        Nokogiri::XML resp.body
      end

      #
      # Fix editorial group: move the content of each group's
      # "technical_committee" hash up into the group hash itself.
      # The passed hash is modified in place.
      #
      # @param [Hash] doc
      #
      # @return [Hash] the same `doc` object
      #
      def doc_to_hash(doc)
        array(doc["editorialgroup"]).each do |eg|
          tc = eg.delete("technical_committee")
          eg.merge!(tc) if tc
        end
        doc
      end

      #
      # Append to the item every link whose type `bib.url` does not find.
      #
      # @param bib [Object] item responding to `link` and `url`
      # @param links [Array<Hash>] link hashes with String keys
      #
      # @return [Object] the same `bib` object
      #
      def update_links(bib, links)
        links.each do |l|
          next if bib.url(l["type"])

          bib.link << RelatonBib::TypedUri.new(**l.transform_keys(&:to_sym))
        end
        bib
      end

      #
      # Wrap into Array if not Array
      #
      # @param [Array, Hash, String, nil] content
      #
      # @return [Array<Hash, String>]
      #
      def array(content)
        case content
        when Array then content
        when nil then []
        else [content]
        end
      end
    end
  end
end