Sha256: 96f966cb6965370303db6b83e8ccd9510783ffe110abdf71081b34bbbe21a54d
Contents?: true
Size: 1.89 KB
Versions: 5
Compression:
Stored size: 1.89 KB
Contents
# encoding: UTF-8 # frozen_string_literal: true require "open-uri" require "nokogiri" require "relaton_gb/scrapper" require "relaton_gb/gb_bibliographic_item" module RelatonGb # National standard scrapper. module GbScrapper extend Scrapper SEARCH_URL = "https://openstd.samr.gov.cn/bzgk/gb/std_list" DOC_URL = "http://openstd.samr.gov.cn/bzgk/gb/newGbInfo?hcno=" class << self # @param text [Strin] code of standard for serarch # @return [RelatonGb::HitCollection] def scrape_page(text) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize doc = agent.get("#{SEARCH_URL}?p.p2=#{CGI.escape(text)}") hits = doc.xpath( "//table[contains(@class, 'result_list')]/tbody[2]/tr", ).map do |h| ref = h.at "./td[2]/a" pid = ref[:onclick].match(/[0-9A-F]+/).to_s rdate = h.at("./td[7]").text Hit.new pid: pid, docref: ref.text, scrapper: self, release_date: rdate end HitCollection.new hits.sort_by(&:release_date).reverse rescue Mechanize::Error => e raise RelatonBib::RequestError, e.message end def agent @agent ||= Mechanize.new end # @param hit [RelatonGb::Hit] standard's page id # @return [RelatonGb::GbBibliographicItem] def scrape_doc(hit) src = DOC_URL + hit.pid doc = agent.get src GbBibliographicItem.new(**scrapped_data(doc, src, hit)) rescue Mechanize::Error => e raise RelatonBib::RequestError, e.message end # @param doc [Nokogiri::HTML] # @param _ref [String] # @return [Hash] # * :type [String] # * :name [String] def get_committee(doc, _ref) name = doc.at("//div[contains(., '归口单位') or contains(., '归口部门')]/following-sibling::div") { type: "technical", name: name.text.delete("\r\n\t\t") } end end end end
Version data entries
5 entries across 5 versions & 1 rubygems