# frozen_string_literal: true

require "date"
require "nokogiri"
require "net/http"

module RelatonItu
  # Scrapper. Turns an ITU web page referenced by a search hit into an
  # ItuBibliographicItem by picking the individual fields out of the HTML
  # with XPath/CSS queries.
  module Scrapper
    # Roman numerals I..XII; the array index + 1 is the arabic month number.
    ROMAN_MONTHS = %w[I II III IV V VI VII VIII IX X XI XII].freeze

    # NOTE(review): "appruvedWorkItem" looks like a misspelling of "approved";
    # the value is left untouched because code outside this file may compare
    # against the exact string.
    TYPES = {
      "ISO" => "international-standard",
      "TS" => "technicalSpecification",
      "TR" => "technicalReport",
      "PAS" => "publiclyAvailableSpecification",
      "AWI" => "appruvedWorkItem",
      "CD" => "committeeDraft",
      "FDIS" => "finalDraftInternationalStandard",
      "NP" => "newProposal",
      "DIS" => "draftInternationalStandard",
      "WD" => "workingDraft",
      "R" => "recommendation",
      "Guide" => "guide",
    }.freeze

    class << self
      # Parse page.
      #
      # Returns nil when the page response code is not "200", or when +imp+ is
      # set and the page has no link inside a span whose id contains
      # "tab_ig_uc_rec".
      #
      # @param hit [RelatonItu::Hit]
      # @param imp [Boolean] when true, follow the link found in the
      #   "tab_ig_uc_rec" span and parse that page instead of the hit page
      # @return [RelatonItu::ItuBibliographicItem, nil]
      def parse_page(hit, imp = false) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
        doc = get_page hit
        return unless doc.code == "200"

        if imp
          a = doc.at "//span[contains(@id, 'tab_ig_uc_rec')]/a"
          return unless a

          doc = get_page hit, a[:href].to_s
        end

        # Fetch edition.
        edition = doc.at("//table/tr/td/span[contains(@id, 'Label8')]/b")&.text

        ItuBibliographicItem.new(
          fetched: Date.today.to_s,
          type: "standard",
          docid: fetch_docid(doc, hit.hit[:title]),
          edition: edition,
          language: ["en"],
          script: ["Latn"],
          title: fetch_titles(doc),
          doctype: hit.hit[:type],
          docstatus: fetch_status(doc),
          ics: [], # fetch_ics(doc),
          date: fetch_dates(doc),
          contributor: fetch_contributors(hit.hit[:code]),
          editorialgroup: fetch_workgroup(hit.hit[:code], doc),
          abstract: fetch_abstract(doc, hit),
          copyright: fetch_copyright(hit.hit[:code], doc),
          link: fetch_link(doc),
          relation: fetch_relations(doc),
          place: ["Geneva"],
        )
      end

      private

      # Fetch abstracts.
      #
      # Prefers the document linked from the "In force" row (the URL is taken
      # from the element's onclick attribute and fetched with the hit
      # collection's agent); otherwise falls back to the text of a span whose
      # class contains "observation". A Mechanize response-code error is
      # reported with +warn+ and yields an empty array.
      #
      # @param doc [Mechanize::Page]
      # @param hit [RelatonItu::Hit]
      # @return [Array<Hash>] zero or one hash with :content, :language, :script
      def fetch_abstract(doc, hit) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
        abstract_url = doc.at '//table/tr[td/span[.="In force"]]/td/span[contains(@id, "lbl_dms")]/div'
        content = if abstract_url
                    url = abstract_url[:onclick].match(/https?[^']+/).to_s
                    rsp = hit.hit_collection.agent.get url
                    # Drop characters that cannot be converted instead of raising.
                    d = Nokogiri::HTML rsp.body.encode(undef: :replace, replace: "")
                    d.css("p.MsoNormal").text.gsub(/\r\n/, "").squeeze(" ").gsub(/\u00a0/, "")
                  elsif (a = doc.at('//table/tr/td/span[contains(@class, "observation")]/text()'))
                    a.text.strip
                  end
        return [] unless content

        [{
          content: content,
          language: "en",
          script: "Latn",
        }]
      rescue Mechanize::ResponseCodeError => e
        warn "HTTP Service Unavailable: #{e.message}"
        []
      end

      # Get page.
      #
      # @param hit [RelatonItu::Hit]
      # @param url [String, nil] page to fetch; defaults to the hit's own :url
      # @return [Mechanize::Page] whatever the hit collection's agent returns
      # @raise [RelatonBib::RequestError] on network/protocol level failures
      def get_page(hit, url = nil)
        uri = url || hit.hit[:url]
        hit.hit_collection.agent.get uri
      rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
             EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
             Net::ProtocolError, OpenSSL::SSL::SSLError
        raise RelatonBib::RequestError, "Could not access #{uri}"
      end

      # Fetch docid.
      #
      # Builds one identifier per node matched on the page; when nothing
      # matches, a single identifier is built from +title+ instead.
      #
      # @param doc [Mechanize::Page]
      # @param title [String]
      # @return [Array<RelatonBib::DocumentIdentifier>]
      def fetch_docid(doc, title)
        docids = doc.xpath(
          "//span[@id='ctl00_content_main_uc_rec_main_info1_rpt_main_ctl00_lbl_rec']",
          "//td[.='Identical standard:']/following-sibling::td",
          "//div/table[1]/tr[4]/td/strong",
        ).map { |c| createdocid c.text }
        docids << createdocid(title) if docids.empty?
        docids
      end

      # Build a document identifier from free text.
      #
      # The named captures below become local variables (+code+, +buldate+,
      # +corr+) because the regexp literal is the left operand of =~.
      #
      # @param text [String]
      # @return [RelatonBib::DocumentIdentifier]
      def createdocid(text) # rubocop:disable Metrics/MethodLength
        %r{
          ^(?<code>(?:(?:ITU-\w|ISO/IEC)\s)?[^(:]+)
          (?:\((?:(?<_month>\d{2})/)?(?<_year>\d{4})\))?
          (?::[^(]+\((?<buldate>\d{2}\.\w{1,4}\.\d{4})\))?
          (?:\s(?<corr>(?:Amd|Cor)\.\s?\d+))?
          # (\s\(((?<_cormonth>\d{2})\/)?(?<_coryear>\d{4})\))?
        }x =~ text.squeeze(" ")
        # "Amd. 1" / "Cor.1" -> "Amd 1" / "Cor 1"
        corr&.sub!(/\.\s?/, " ")
        id = [code.sub(/[[:space:]]$/, ""), corr].compact.join " "
        id += " - #{buldate}" if buldate
        # The identifier type is the leading word of the id.
        type = id.match(%r{^\w+}).to_s
        type = "ITU" if type == "G"
        RelatonBib::DocumentIdentifier.new(type: type, id: id, primary: true)
      end

      # Fetch status.
      #
      # @param doc [Mechanize::Page]
      # @return [RelatonBib::DocumentStatus, NilClass] nil when the page has no
      #   status element
      def fetch_status(doc)
        s = doc.at("//table/tr/td/span[contains(@id, 'Label7')]",
                   "//p[contains(.,'Status :')]")
        return unless s

        status = s.text.include?("In force") ? "Published" : "Withdrawal"
        RelatonBib::DocumentStatus.new(stage: status)
      end

      # Fetch workgroup.
      #
      # The bureau is the single character following the first "-" in +code+;
      # the group is nil when the page has no matching link.
      #
      # @param code [String]
      # @param doc [Mechanize::Page]
      # @return [RelatonItu::EditorialGroup]
      def fetch_workgroup(code, doc)
        wg = doc.at('//table/tr/td/span[contains(@id, "Label8")]/a')
        group = wg && itugroup(wg.text)
        EditorialGroup.new(
          bureau: code.match(/(?<=-)./).to_s,
          group: group,
        )
      end

      # Classify a group by keywords in its name.
      #
      # @param name [String]
      # @return [RelatonItu::ItuGroup]
      def itugroup(name) # rubocop:disable Metrics/MethodLength
        if name.include? "Study Group"
          type = "study-group"
          acronym = "SG"
        elsif name.include? "Telecommunication Standardization Advisory Group"
          type = "tsag"
          acronym = "TSAG"
        else
          type = "work-group"
          acronym = "WG"
        end
        ItuGroup.new name: name, type: type, acronym: acronym
      end

      # Fetch relations.
      #
      # Every table row after the second inside the "tab_sup" div becomes a
      # "complements" relation. Rows that carry no title link are skipped
      # rather than raising on a nil node.
      #
      # @param doc [Mechanize::Page]
      # @return [Array<Hash>] hashes with :type and :bibitem
      def fetch_relations(doc)
        rows = doc.xpath('//div[contains(@id, "tab_sup")]//table/tr[position()>2]')
        rows.map do |r|
          ref = r.at('./td/span[contains(@id, "title_e")]/nobr/a')
          next unless ref

          fref = RelatonBib::FormattedRef.new(content: ref.text, language: "en", script: "Latn")
          bibitem = ItuBibliographicItem.new(formattedref: fref, type: "standard")
          { type: "complements", bibitem: bibitem }
        end.compact
      end

      # Fetch titles.
      #
      # @param doc [Mechanize::Page]
      # @return [RelatonBib::TypedTitleStringCollection, Array] empty array when
      #   no title node is found
      def fetch_titles(doc)
        t = doc.at("//td[@class='title']|//div/table[1]/tr[4]/td/strong")
        return [] unless t

        RelatonBib::TypedTitleString.from_string t.text, "en", "Latn"
      end

      # Fetch dates.
      #
      # Uses the first YYYY-MM-DD found in the approval element; when there is
      # none, falls back to the Operational Bulletin date.
      #
      # @param doc [Mechanize::Page]
      # @return [Array<Hash>] zero or one hash with :type and :on
      def fetch_dates(doc)
        date = doc.at("//table/tr/td/span[contains(@id, 'Label5')]",
                      "//p[contains(.,'Approved in')]")
        # String#slice(regexp) is nil on no match, so the fallback is reachable.
        pdate = date&.text&.slice(/\d{4}-\d{2}-\d{2}/) || ob_date(doc)
        return [] unless pdate

        [{ type: "published", on: pdate }]
      end

      # Scrape Operational Bulletin date.
      #
      # @param doc [Mechanize::Page]
      # @return [String, nil] nil when the page has no "Year:" cell
      def ob_date(doc)
        pdate = doc.at('//table/tbody/tr/td[contains(text(), "Year:")]')
        return unless pdate

        roman_to_arabic pdate.text.match(%r{(?<=Year: )(\d{2}.\w+.)?\d{4}}).to_s
      end

      # Convert roman month number in string date to arabic number.
      #
      # When the first run of I/V/X characters is a numeral from ROMAN_MONTHS
      # it is replaced by the month number and the result is parsed with
      # Date.parse; otherwise +date+ is returned unchanged.
      #
      # @param date [String]
      # @return [String]
      def roman_to_arabic(date)
        %r{(?<rmonth>[IVX]+)} =~ date
        month_index = ROMAN_MONTHS.index(rmonth)
        return date unless month_index

        Date.parse(date.sub(%r{[IVX]+}, (month_index + 1).to_s)).to_s
      end

      # Fetch contributors.
      #
      # The abbreviation is +code+ with everything from the "-X " bureau part
      # onwards removed; name and url are only filled in for "ITU".
      #
      # @param code [String, nil]
      # @return [Array<Hash>] empty when +code+ is nil
      def fetch_contributors(code)
        return [] unless code

        abbrev = code.sub(/-\w\s.*/, "")
        case abbrev
        when "ITU"
          name = "International Telecommunication Union"
          url = "www.itu.int"
        end
        [{ entity: { name: name, url: url, abbreviation: abbrev },
           role: [{ type: "publisher" }] }]
      end

      # Fetch links.
      #
      # Always includes the page's own URI as "src"; adds "obp" and "word"
      # links when the corresponding elements are present.
      #
      # @param doc [Mechanize::Page]
      # @return [Array<Hash>] hashes with :type and :content
      def fetch_link(doc)
        links = [{ type: "src", content: doc.uri.to_s }]
        obp_elm = doc.at(
          '//a[@title="Persistent link to download the PDF file"]',
          "//font[contains(.,'PDF')]/../..",
        )
        links << typed_link("obp", obp_elm) if obp_elm
        wrd_elm = doc.at("//font[contains(.,'Word')]/../..")
        links << typed_link("word", wrd_elm) if wrd_elm
        links
      end

      # Build a link hash whose content is the element's href resolved against
      # HitCollection::DOMAIN.
      #
      # @param type [String]
      # @param elm [Nokogiri::XML::Element]
      # @return [Hash]
      def typed_link(type, elm)
        {
          type: type,
          content: URI.join(HitCollection::DOMAIN, elm[:href].strip).to_s,
        }
      end

      # Fetch copyright.
      #
      # The owner abbreviation is the part of +code+ before the first "-";
      # name and url are only filled in for "ITU". The start date is the text
      # of the approval element, or the Operational Bulletin date when that
      # element is missing.
      #
      # @param code [String]
      # @param doc [Mechanize::Page]
      # @return [Array<Hash>] one hash with :owner and :from
      def fetch_copyright(code, doc)
        abbreviation = code.match(/^[^-]+/).to_s
        case abbreviation
        when "ITU"
          name = "International Telecommunication Union"
          url = "www.itu.int"
        end
        fdate = doc.at("//table/tr/td/span[contains(@id, 'Label5')]")
        from = fdate&.text || ob_date(doc)
        [{ owner: [{ name: name, abbreviation: abbreviation, url: url }], from: from }]
      end
    end
  end
end