# frozen_string_literal: true require "nokogiri" require "net/http" module RelatonItu # Scrapper. module Scrapper ROMAN_MONTHS = %w[I II III IV V VI VII VIII IX X XI XII].freeze TYPES = { "ISO" => "international-standard", "TS" => "technicalSpecification", "TR" => "technicalReport", "PAS" => "publiclyAvailableSpecification", "AWI" => "appruvedWorkItem", "CD" => "committeeDraft", "FDIS" => "finalDraftInternationalStandard", "NP" => "newProposal", "DIS" => "draftInternationalStandard", "WD" => "workingDraft", "R" => "recommendation", "Guide" => "guide", }.freeze class << self # Parse page. # @param hit [RelatonItu::Hit] # @return [Hash] def parse_page(hit, imp = false) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength doc = get_page hit return unless doc.code == "200" if imp a = doc.at "//span[contains(@id, 'tab_ig_uc_rec')]/a" return unless a doc = get_page hit, a[:href].to_s end # Fetch edition. edition = doc.at("//table/tr/td/span[contains(@id, 'Label8')]/b")&.text ItuBibliographicItem.new( fetched: Date.today.to_s, type: "standard", docid: fetch_docid(doc, hit.hit[:title]), edition: edition, language: ["en"], script: ["Latn"], title: fetch_titles(doc), doctype: hit.hit[:type], docstatus: fetch_status(doc), ics: [], # fetch_ics(doc), date: fetch_dates(doc), contributor: fetch_contributors(hit.hit[:code]), editorialgroup: fetch_workgroup(hit.hit[:code], doc), abstract: fetch_abstract(doc, hit), copyright: fetch_copyright(hit.hit[:code], doc), link: fetch_link(doc), relation: fetch_relations(doc), place: ["Geneva"], ) end private # Fetch abstracts. 
# @param doc [Mechanize::Page]
      # @param hit [RelatonItu::Hit] its collection agent downloads the abstract
      # @return [Array<Hash>] a single abstract hash, or an empty array when no
      #   abstract is found or the download fails with a response code error
      def fetch_abstract(doc, hit) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
        abstract_url = doc.at '//table/tr[td/span[.="In force"]]/td/span[contains(@id, "lbl_dms")]/div'
        content = if abstract_url
                    # The abstract lives on a separate page whose URL is
                    # embedded in the element's onclick attribute.
                    url = abstract_url[:onclick].match(/https?[^']+/).to_s
                    rsp = hit.hit_collection.agent.get url
                    d = Nokogiri::HTML rsp.body.encode(undef: :replace, replace: "")
                    d.css("p.MsoNormal").text.gsub(/\r\n/, "").squeeze(" ").gsub(/\u00a0/, "")
                  elsif (a = doc.at('//table/tr/td/span[contains(@class, "observation")]/text()'))
                    a.text.strip
                  end
        return [] unless content

        [{
          content: content,
          language: "en",
          script: "Latn",
        }]
      rescue Mechanize::ResponseCodeError => e
        warn "HTTP Service Unavailable: #{e.message}"
        []
      end

      # Get page.
      # @param hit [RelatonItu::Hit]
      # @param url [String, nil] page to fetch; defaults to the hit's own URL
      # @return [Mechanize::Page] whatever the collection agent returns
      # @raise [RelatonBib::RequestError] when a network-level error occurs
      def get_page(hit, url = nil)
        uri = url || hit.hit[:url]
        hit.hit_collection.agent.get uri
      rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
             EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
             Net::ProtocolError, OpenSSL::SSL::SSLError
        raise RelatonBib::RequestError, "Could not access #{uri}"
      end

      # Fetch docid.
      # @param doc [Mechanize::Page]
      # @param title [String] used as the identifier source when the page
      #   contains none of the known identifier elements
      # @return [Array<RelatonBib::DocumentIdentifier>]
      def fetch_docid(doc, title)
        docids = doc.xpath(
          "//span[@id='ctl00_content_main_uc_rec_main_info1_rpt_main_ctl00_lbl_rec']",
          "//td[.='Identical standard:']/following-sibling::td",
          "//div/table[1]/tr[4]/td/strong",
        ).map { |c| createdocid c.text }
        docids << createdocid(title) unless docids.any?
        docids
      end

      # Build a document identifier from a textual reference.
      # @param text [String]
      # @return [RelatonBib::DocumentIdentifier]
      def createdocid(text) # rubocop:disable Metrics/MethodLength
        # A regexp literal on the left of =~ assigns its named groups to
        # local variables (code, buldate, corr); underscore-prefixed groups
        # are matched but intentionally unused.
        %r{
          ^(?<code>(?:(?:ITU-\w|ISO/IEC)\s)?[^(:]+)
          (?:\((?:(?<_month>\d{2})/)?(?<_year>\d{4})\))?
          (?::[^(]+\((?<buldate>\d{2}\.\w{1,4}\.\d{4})\))?
          (?:\s(?<corr>(?:Amd|Cor)\.\s?\d+))?
          # (\s\(((?<_cormonth>\d{2})\/)?(?<_coryear>\d{4})\))?
        }x =~ text.squeeze(" ")
        corr&.sub!(/\.\s?/, " ") # "Amd. 1" -> "Amd 1"
        id = [code.sub(/[[:space:]]$/, ""), corr].compact.join " "
        id += " - #{buldate}" if buldate
        type = id.match(%r{^\w+}).to_s
        type = "ITU" if type == "G"
        RelatonBib::DocumentIdentifier.new(type: type, id: id, primary: true)
      end

      # Derive the document status from the page.
      # @param doc [Mechanize::Page]
      # @return [RelatonBib::DocumentStatus, NilClass] nil when the page has
      #   no status element; stage is "Published" when the status text
      #   contains "In force" and "Withdrawal" otherwise
      def fetch_status(doc)
        node = doc.at(
          "//table/tr/td/span[contains(@id, 'Label7')]",
          "//p[contains(.,'Status :')]",
        )
        return unless node

        stage = if node.text.include?("In force")
                  "Published"
                else
                  "Withdrawal"
                end
        RelatonBib::DocumentStatus.new(stage: stage)
      end

      # Build the editorial group for a document.
      # @param code [String] document code; the character right after the
      #   first "-" becomes the bureau
      # @param doc [Mechanize::Page]
      # @return [RelatonItu::EditorialGroup] its group is nil when the page
      #   has no group link
      def fetch_workgroup(code, doc)
        link = doc.at('//table/tr/td/span[contains(@id, "Label8")]/a')
        group = itugroup(link.text) if link
        bureau = code.match(/(?<=-)./).to_s
        EditorialGroup.new(bureau: bureau, group: group)
      end

      # Classify a group by its name and wrap it in an ItuGroup.
      # @param name [String]
      # @return [RelatonItu::ItuGroup]
      def itugroup(name) # rubocop:disable Metrics/MethodLength
        type, acronym =
          if name.include?("Study Group")
            ["study-group", "SG"]
          elsif name.include?("Telecommunication Standardization Advisory Group")
            ["tsag", "TSAG"]
          else
            ["work-group", "WG"]
          end
        ItuGroup.new(name: name, type: type, acronym: acronym)
      end

      # Collect the documents listed in the page's "tab_sup" table as
      # "complements" relations.
      # @param doc [Mechanize::Page]
      # @return [Array<Hash>] hashes with :type and :bibitem keys
      def fetch_relations(doc)
        rows = doc.xpath('//div[contains(@id, "tab_sup")]//table/tr[position()>2]')
        rows.each_with_object([]) do |row, relations|
          ref = row.at('./td/span[contains(@id, "title_e")]/nobr/a')
          # A row without a title link has nothing to reference; skip it
          # instead of failing on nil.
          next unless ref

          fref = RelatonBib::FormattedRef.new(content: ref.text, language: "en",
                                              script: "Latn")
          bibitem = ItuBibliographicItem.new(formattedref: fref,
                                             type: "standard")
          relations << { type: "complements", bibitem: bibitem }
        end
      end

      # Read the document title from the page.
      # @param doc [Mechanize::Page]
      # @return [RelatonBib::TypedTitleStringCollection, Array] an empty
      #   array when the page has no title element
      def fetch_titles(doc)
        title_node = doc.at("//td[@class='title']|//div/table[1]/tr[4]/td/strong")
        if title_node
          RelatonBib::TypedTitleString.from_string(title_node.text, "en", "Latn")
        else
          []
        end
      end

      # Fetch the publication date.
      #
      # An ISO date (YYYY-MM-DD) found in the approval element wins; otherwise
      # the Operational Bulletin date is used. Empty values never produce a
      # date entry.
      # @param doc [Mechanize::Page]
      # @return [Array<Hash>] zero or one hash with :type and :on keys
      def fetch_dates(doc)
        approval = doc.at("//table/tr/td/span[contains(@id, 'Label5')]",
                          "//p[contains(.,'Approved in')]")
        # String#slice returns nil (not "") when nothing matches, so the
        # fallback to ob_date is actually reachable.
        pdate = approval&.text&.slice(/\d{4}-\d{2}-\d{2}/) || ob_date(doc)
        return [] if pdate.nil? || pdate.empty?

        [{ type: "published", on: pdate }]
      end

      # Scrape the Operational Bulletin date from the "Year:" table cell.
      # @param doc [Mechanize::Page]
      # @return [String, NilClass] nil when the page has no "Year:" cell;
      #   otherwise the text following "Year: " passed through roman_to_arabic
      def ob_date(doc)
        year_cell = doc.at('//table/tbody/tr/td[contains(text(), "Year:")]')
        return unless year_cell

        raw = year_cell.text.match(%r{(?<=Year: )(\d{2}.\w+.)?\d{4}})
        roman_to_arabic(raw.to_s)
      end

      # Convert a roman month numeral inside a date string to an ISO date.
      # @param date [String] e.g. "15.XII.2011"
      # @return [String] the parsed date as YYYY-MM-DD, or the input unchanged
      #   when it holds no numeral listed in ROMAN_MONTHS
      def roman_to_arabic(date)
        # The named group assigns the local variable rmonth (nil on no match).
        /(?<rmonth>[IVX]+)/ =~ date
        index = ROMAN_MONTHS.index(rmonth)
        return date unless index

        Date.parse(date.sub(/[IVX]+/, (index + 1).to_s)).to_s
      end

      # Build the publisher contributor from a document code.
      # @param code [String, nil] document code, e.g. "ITU-T G.989.2"
      # @return [Array<Hash>] empty when code is nil; name and url are only
      #   filled in for the "ITU" abbreviation
      def fetch_contributors(code)
        return [] unless code

        abbreviation = code.sub(/-\w\s.*/, "")
        entity = { name: nil, url: nil, abbreviation: abbreviation }
        if abbreviation == "ITU"
          entity[:name] = "International Telecommunication Union"
          entity[:url] = "www.itu.int"
        end
        [{ entity: entity, role: [{ type: "publisher" }] }]
      end

      # Collect the links of a document page: the page itself ("src") plus,
      # when present, the PDF ("obp") and Word ("word") download links.
      # @param doc [Mechanize::Page]
      # @return [Array<Hash>] hashes with :type and :content keys
      def fetch_link(doc)
        source = { type: "src", content: doc.uri.to_s }
        pdf_anchor = doc.at(
          '//a[@title="Persistent link to download the PDF file"]',
          "//font[contains(.,'PDF')]/../..",
        )
        word_anchor = doc.at("//font[contains(.,'Word')]/../..")
        [
          source,
          (typed_link("obp", pdf_anchor) if pdf_anchor),
          (typed_link("word", word_anchor) if word_anchor),
        ].compact
      end

      # Build a link hash whose content is the element's href resolved
      # against HitCollection::DOMAIN.
      # @param type [String]
      # @param elm [Nokogiri::XML::Element]
      # @return [Hash] with :type and :content keys
      def typed_link(type, elm)
        base = HitCollection::DOMAIN
        target = URI.join(base, elm[:href].strip)
        { type: type, content: target.to_s }
      end

      # Build the copyright entry of a document.
      # @param code [String] document code; the part before the first "-"
      #   is the owner abbreviation
      # @param doc [Mechanize::Page]
      # @return [Array<Hash>] one hash with :owner and :from keys; :from is the
      #   text of the 'Label5' element or, failing that, the Operational
      #   Bulletin date
      def fetch_copyright(code, doc)
        abbreviation = code.match(/^[^-]+/).to_s
        owner = { name: nil, abbreviation: abbreviation, url: nil }
        if abbreviation == "ITU"
          owner[:name] = "International Telecommunication Union"
          owner[:url] = "www.itu.int"
        end
        approval = doc.at("//table/tr/td/span[contains(@id, 'Label5')]")
        from = approval.text if approval
        from ||= ob_date(doc)
        [{ owner: [owner], from: from }]
      end
    end
  end
end