lib/relaton_calconnect/scrapper.rb in relaton-calconnect-1.9.0 vs lib/relaton_calconnect/scrapper.rb in relaton-calconnect-1.9.1
- old
+ new
@@ -1,41 +1,96 @@
module RelatonCalconnect
  # Builds bibliographic items for CalConnect documents, fetching RXL (XML)
  # records over HTTP from DOMAIN when a document entry links to one.
  module Scrapper
    DOMAIN = "https://standards.calconnect.org/".freeze
    # Scheme ("https") and host of DOMAIN; used to complete links that have
    # no host of their own.
    SCHEME, HOST = DOMAIN.split(%r{:?/?/})
    # DOMAIN = "http://127.0.0.1:4000/".freeze

    class << self
      #
      # Build a bibliographic item from a document entry.
      #
      # If the entry has a link of type "rxl", the item is parsed from the
      # fetched RXL and the entry's remaining links are added to it.
      # Otherwise the item is built directly from the entry hash.
      #
      # @param hit [Hash] document entry; its "link" value may be a single
      #   Hash, an Array of Hashes, or absent
      #
      # @return [RelatonCalconnect::CcBibliographicItem] the RXL branch returns
      #   whatever XMLParser.from_xml builds - presumably the same class
      #
      def parse_page(hit)
        links = array(hit["link"])
        rxl_link = links.find { |l| l["type"] == "rxl" }
        if rxl_link
          bib = fetch_bib_xml rxl_link["content"]
          update_links bib, links
        else
          bib = RelatonCalconnect::CcBibliographicItem.from_hash doc_to_hash(hit)
        end
        # Links whose URI has no host get the scheme and host of DOMAIN.
        bib.link.each { |l| l.content.merge!(scheme: SCHEME, host: HOST) unless l.content.host }
        bib
      end

      private

      #
      # Fetch an RXL document and parse it into a bibliographic item.
      #
      # If the fetched document itself points to another RXL document
      # (a `uri` element of type "rxl"), that document is fetched instead and
      # all `uri` elements of the first one are inserted before its
      # `docidentifier` element.
      #
      # @param url [String] path of the RXL document, relative to DOMAIN
      #
      # @return [Object] result of RelatonCalconnect::XMLParser.from_xml
      #
      def fetch_bib_xml(url)
        rxl = get_rxl url
        uri_rxl = rxl.at("uri[@type='rxl']")
        if uri_rxl
          uri_xml = rxl.xpath("//uri").to_xml
          rxl = get_rxl uri_rxl.text
          docid = rxl.at "//docidentifier"
          docid.add_previous_sibling uri_xml
        end
        # Non-bang gsub on purpose: gsub! returns nil when nothing is replaced,
        # which would hand nil to the parser for documents that have no
        # technical-committee element.
        xml = rxl.to_xml.gsub(%r{(</?)technical-committee(>)}, '\1committee\2')
        RelatonCalconnect::XMLParser.from_xml xml
      end

      #
      # Download a document from DOMAIN and parse it as XML.
      #
      # @param path [String] path appended to DOMAIN
      #
      # @return [Nokogiri::XML::Document]
      #
      def get_rxl(path)
        resp = Faraday.get DOMAIN + path
        Nokogiri::XML resp.body
      end

      #
      # Fix editorial group: the content of each group's
      # "technical_committee" hash is moved up into the group itself.
      # The given hash is modified in place.
      #
      # @param [Hash] doc
      #
      # @return [Hash] the same doc object
      #
      def doc_to_hash(doc)
        array(doc["editorialgroup"]).each do |eg|
          tc = eg.delete("technical_committee")
          eg.merge!(tc) if tc
        end
        doc
      end

      #
      # Add to the item every link whose type the item has no URL for yet.
      #
      # @param bib [Object] bibliographic item; must respond to `link` and `url`
      # @param links [Array<Hash>] link hashes with string keys
      #
      # @return [Object] the same bib object
      #
      def update_links(bib, links)
        links.each do |l|
          next if bib.url(l["type"])

          # TypedUri takes keyword arguments, so the string keys become symbols.
          tu = l.transform_keys(&:to_sym)
          bib.link << RelatonBib::TypedUri.new(**tu)
        end
        bib
      end

      #
      # Wrap into Array if not Array. Unlike Kernel#Array, a Hash is wrapped
      # as a single element rather than converted to key/value pairs.
      #
      # @param [Array, Hash, String, nil] content
      #
      # @return [Array<Hash, String>]
      #
      def array(content)
        case content
        when Array then content
        when nil then []
        else [content]
        end
      end
    end
  end
end