module RelatonIso
  # Fetch all the documents from the ISO website.
  class DataFetcher
    #
    # Initialize data fetcher.
    #
    # @param [String] output output directory
    # @param [String] format format of output files (yaml, bibxml, xml)
    #
    def initialize(output, format) # rubocop:disable Metrics/AbcSize
      @output = output
      @format = format
      @ext = format.sub(/^bib/, "")
      @files = Set.new
      @queue = ::Queue.new
      @mutex = Mutex.new
      @gh_issue = Relaton::Logger::Channels::GhIssue.new "relaton/relaton-iso", "Error fetching ISO documents"
      Relaton.logger_pool[:gh_issue] = Relaton::Logger::Log.new(@gh_issue, levels: [:error])
      # Error flags default to true; each flag is ANDed with a check result,
      # so it stays true only if the corresponding fetch never succeeded.
      @errors = Hash.new(true)
    end

    def index
      @index ||= Relaton::Index.find_or_create :iso, file: HitCollection::INDEXFILE
    end

    def iso_queue
      @iso_queue ||= RelatonIso::Queue.new
    end

    #
    # Initialize data fetcher and fetch data.
    #
    # @param [String] output output directory (default: "data")
    # @param [String] format format of output files. Allowed: yaml (default), bibxml, xml
    #
    # @return [void]
    #
    def self.fetch(output: "data", format: "yaml")
      t1 = Time.now
      Util.info "Started at: #{t1}"
      FileUtils.mkdir_p output
      new(output, format).fetch
      t2 = Time.now
      Util.info "Stopped at: #{t2}"
      Util.info "Done in: #{(t2 - t1).round} sec."
    end

    #
    # Go through all ICS pages and fetch all documents.
    #
    # @return [void]
    #
    def fetch # rubocop:disable Metrics/AbcSize
      Util.info "Scraping ICS pages..."
      fetch_ics
      Util.info "(#{Time.now}) Scraping documents..."
      fetch_docs
      iso_queue.save
      # index.sort! { |a, b| compare_docids a, b }
      index.save
      report_errors
    end

    def report_errors
      @errors.select { |_, v| v }.each_key do |k|
        Util.error "Failed to fetch #{k}"
      end
      @gh_issue.create_issue
    end

    #
    # Fetch ICS pages recursively and store all the document links in the iso_queue.
    #
    # @return [void]
    #
    def fetch_ics
      threads = Array.new(3) { thread { |path| fetch_ics_page(path) } }
      fetch_ics_page "/standards-catalogue/browse-by-ics.html"
      sleep(1) until @queue.empty?
      threads.size.times { @queue << :END }
      threads.each(&:join)
    end

    def fetch_ics_page(path)
      resp = get_redirection path
      unless resp
        Util.error "Failed fetching ICS page #{url(path)}"
        return
      end

      page = Nokogiri::HTML(resp.body)
      parse_doc_links page
      parse_ics_links page
    end

    def parse_doc_links(page)
      doc_links = page.xpath "//td[@data-title='Standard and/or project']/div/div/a"
      @errors[:doc_links] &&= doc_links.empty?
      doc_links.each { |item| iso_queue.add_first item[:href].split("?").first }
    end

    def parse_ics_links(page)
      ics_links = page.xpath("//td[@data-title='ICS']/a")
      @errors[:ics_links] &&= ics_links.empty?
      ics_links.each { |item| @queue << item[:href] }
    end

    def url(path)
      Scrapper::DOMAIN + path
    end

    #
    # Get the page from the given path. If the page is redirected, get the
    # page from the new path.
    #
    # @param [String] path path to the page
    #
    # @return [Net::HTTPOK, nil] HTTP response
    #
    def get_redirection(path) # rubocop:disable Metrics/MethodLength
      try = 0
      uri = URI url(path)
      begin
        get_response uri
      rescue Net::OpenTimeout, Net::ReadTimeout, Errno::ECONNREFUSED => e
        try += 1
        retry if check_try try, uri
        Util.warn "Failed fetching #{uri}, #{e.message}"
      end
    end

    def get_response(uri)
      resp = Net::HTTP.get_response(uri)
      resp.code == "302" ? get_redirection(resp["location"]) : resp
    end

    def check_try(try, uri)
      if try < 3
        Util.warn "Timeout fetching #{uri}, retrying..."
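        # Pause briefly before signalling the caller to retry; get_redirection
        # makes up to three attempts in total before giving up with a warning.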
        sleep 1
        true
      end
    end

    def fetch_docs
      threads = Array.new(3) { thread { |path| fetch_doc(path) } }
      iso_queue[0..10_000].each { |docpath| @queue << docpath }
      threads.size.times { @queue << :END }
      threads.each(&:join)
    end

    #
    # Fetch document from ISO website.
    #
    # @param [String] docpath document page path
    #
    # @return [void]
    #
    def fetch_doc(docpath)
      doc = Scrapper.parse_page docpath, errors: @errors
      @mutex.synchronize { save_doc doc, docpath }
    rescue StandardError => e
      Util.warn "Failed fetching document: #{url(docpath)}\n#{e.message}\n#{e.backtrace}"
    end

    # def compare_docids(id1, id2)
    #   Pubid::Iso::Identifier.create(**id1).to_s <=> Pubid::Iso::Identifier.create(**id2).to_s
    # end

    #
    # Save document to file.
    #
    # @param [RelatonIsoBib::IsoBibliographicItem] doc document
    #
    # @return [void]
    #
    def save_doc(doc, docpath) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
      docid = doc.docidentifier.detect(&:primary)
      file_name = docid.id.gsub(/[\s\/:]+/, "-").downcase
      file = File.join @output, "#{file_name}.#{@ext}"
      if File.exist?(file)
        rewrite_with_same_or_newer doc, docid, file, docpath
      else
        write_file file, doc, docid
      end
      iso_queue.move_last docpath
    end

    # Rewrite an existing file when the incoming document has a greater edition,
    # or the same edition and replace_substage98? allows the replacement.
    def rewrite_with_same_or_newer(doc, docid, file, docpath)
      hash = YAML.load_file file
      item_hash = HashConverter.hash_to_bib hash
      bib = ::RelatonIsoBib::IsoBibliographicItem.new(**item_hash)
      if edition_greater?(doc, bib) || replace_substage98?(doc, bib)
        write_file file, doc, docid
      elsif @files.include?(file) && !edition_greater?(bib, doc)
        Util.warn "Duplicate file `#{file}` for `#{docid.id}` from #{url(docpath)}"
      end
    end

    def edition_greater?(doc, bib)
      doc.edition && bib.edition && doc.edition.content.to_i > bib.edition.content.to_i
    end

    def replace_substage98?(doc, bib) # rubocop:disable Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
      doc.edition&.content == bib.edition&.content &&
        (doc.status&.substage&.value != "98" || bib.status&.substage&.value == "98")
    end

    def write_file(file, doc, docid)
      @files << file
      index.add_or_update docid.to_h, file
      File.write file, serialize(doc), encoding: "UTF-8"
    end

    #
    # Serialize document to string.
    #
    # @param [RelatonIsoBib::IsoBibliographicItem] doc document
    #
    # @return [String] serialized document
    #
    def serialize(doc)
      case @format
      when "yaml" then doc.to_hash.to_yaml
      when "bibxml" then doc.to_bibxml
      when "xml" then doc.to_xml bibdata: true
      end
    end

    private

    #
    # Create a thread worker that pops paths from the queue until :END.
    #
    # @return [Thread] thread
    #
    def thread
      Thread.new do
        while (path = @queue.pop) != :END
          yield path
        end
      end
    end
  end
end
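# A minimal usage sketch (assumes the relaton-iso gem and its dependencies are
# loaded; the output directory and format below are illustrative):
#
#   RelatonIso::DataFetcher.fetch(output: "data", format: "yaml")
#
# The fetcher scrapes the ISO ICS catalogue, queues document page paths, parses
# them with three worker threads, and writes one file per document to the
# output directory, updating the Relaton index as it goes.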