lib/relaton_iso/data_fetcher.rb in relaton-iso-1.19.1 vs lib/relaton_iso/data_fetcher.rb in relaton-iso-1.19.2

- old
+ new

@@ -5,17 +5,20 @@
     # Initialize data fetcher.
    #
    # @param [String] output output directory
    # @param [String] format format of output files (yaml, bibxml, xml)
    #
-   def initialize(output, format)
+   def initialize(output, format) # rubocop:disable Metrics/AbcSize
      @output = output
      @format = format
      @ext = format.sub(/^bib/, "")
-     @files = []
+     @files = Set.new
      @queue = ::Queue.new
      @mutex = Mutex.new
+     @gh_issue = Relaton::Logger::Channels::GhIssue.new "relaton/relaton-iso", "Error fetching ISO documents"
+     Relaton.logger_pool[:gh_issue] = Relaton::Logger::Log.new(@gh_issue, levels: [:error])
+     @errors = Hash.new(true)
    end

    def index
      @index ||= Relaton::Index.find_or_create :iso, file: HitCollection::INDEXFILE
    end

@@ -32,33 +35,41 @@
    #
    # @return [void]
    #
    def self.fetch(output: "data", format: "yaml")
      t1 = Time.now
-     puts "Started at: #{t1}"
+     Util.info "Started at: #{t1}"
      FileUtils.mkdir_p output
      new(output, format).fetch
      t2 = Time.now
-     puts "Stopped at: #{t2}"
-     puts "Done in: #{(t2 - t1).round} sec."
+     Util.info "Stopped at: #{t2}"
+     Util.info "Done in: #{(t2 - t1).round} sec."
    end

    #
    # Go through all ICS and fetch all documents.
    #
    # @return [void]
    #
    def fetch # rubocop:disable Metrics/AbcSize
-     puts "Scrapping ICS pages..."
+     Util.info "Scrapping ICS pages..."
      fetch_ics
-     puts "[#{Time.now}] Scrapping documents..."
+     Util.info "(#{Time.now}) Scrapping documents..."
      fetch_docs
      iso_queue.save
      # index.sort! { |a, b| compare_docids a, b }
      index.save
+     repot_errors
    end

+   def repot_errors
+     @errors.select { |_, v| v }.each_key do |k|
+       Util.error "Failed to fetch #{k}"
+     end
+     @gh_issue.create_issue
+   end
+
    #
    # Fetch ICS page recursively and store all the links to documents in the iso_queue.
    #
    # @param [String] path path to ICS page
    #
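Note on the error tracking added in 1.19.2: `@errors = Hash.new(true)` makes every error category default to "failed", and each scrape narrows it with `&&=`, so a single successful fetch latches the flag to `false` for the rest of the run. At the end, `repot_errors` logs each category that never succeeded and calls `@gh_issue.create_issue`, so the `:error`-level messages collected by the GhIssue logger channel registered in `initialize` can be filed as a GitHub issue. A minimal, self-contained sketch of the flag pattern (plain Ruby, illustrative names only, not the gem's API):

```ruby
# Every key starts out true ("nothing fetched yet"); `&&=` flips it to
# false as soon as one scrape of that category yields links, and keeps
# it false afterwards.
errors = Hash.new(true)

# Simulated pages: the first has no document links, the second has one.
[[], ["/standard/12345.html"]].each do |doc_links|
  errors[:doc_links] &&= doc_links.empty?
end

# ICS links were never found on any page, so that flag stays true.
errors[:ics_links] &&= [].empty?

errors.select { |_, failed| failed }.each_key do |category|
  puts "Failed to fetch #{category}" # prints only "Failed to fetch ics_links"
end
```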
@@ -70,38 +81,54 @@
     threads.each(&:join)
    end

    def fetch_ics_page(path)
      resp = get_redirection path
-     page = Nokogiri::HTML(resp.body)
-     page.xpath("//td[@data-title='Standard and/or project']/div/div/a").each do |item|
-       iso_queue.add_first item[:href].split("?").first
+     unless resp
+       Util.error "Failed fetching ICS page #{url(path)}"
+       return
      end
-     page.xpath("//td[@data-title='ICS']/a").each do |item|
-       @queue << item[:href]
-     end
+     page = Nokogiri::HTML(resp.body)
+     parse_doc_links page
+     parse_ics_links page
    end

+   def parse_doc_links(page)
+     doc_links = page.xpath "//td[@data-title='Standard and/or project']/div/div/a"
+     @errors[:doc_links] &&= doc_links.empty?
+     doc_links.each { |item| iso_queue.add_first item[:href].split("?").first }
+   end
+
+   def parse_ics_links(page)
+     ics_links = page.xpath("//td[@data-title='ICS']/a")
+     @errors[:ics_links] &&= ics_links.empty?
+     ics_links.each { |item| @queue << item[:href] }
+   end
+
+   def url(path)
+     Scrapper::DOMAIN + path
+   end
+
    #
    # Get the page from the given path. If the page is redirected, get the
    # page from the new path.
    #
    # @param [String] path path to the page
    #
-   # @return [Net::HTTPOK] HTTP response
+   # @return [Net::HTTPOK, nil] HTTP response
    #
    def get_redirection(path) # rubocop:disable Metrics/MethodLength
      try = 0
-     uri = URI(Scrapper::DOMAIN + path)
+     uri = URI url(path)
      begin
        get_response uri
      rescue Net::OpenTimeout, Net::ReadTimeout, Errno::ECONNREFUSED => e
        try += 1
        retry if check_try try, uri
-       Util.error "Error fetching #{uri}, #{e.message}"
+       Util.warn "Failed fetching #{uri}, #{e.message}"
      end
    end

    def get_response(uri)
      resp = Net::HTTP.get_response(uri)

@@ -129,17 +156,14 @@
    # @param [String] docpath document page path
    #
    # @return [void]
    #
    def fetch_doc(docpath)
-     # path = docpath.sub(/\.html$/, "")
-     # hit = Hit.new({ path: docpath }, nil)
-     doc = Scrapper.parse_page docpath
+     doc = Scrapper.parse_page docpath, errors: @errors
      @mutex.synchronize { save_doc doc, docpath }
    rescue StandardError => e
-     Util.error "Error fetching document: #{Scrapper::DOMAIN}#{docpath}\n" \
-                "#{e.message}\n#{e.backtrace}"
+     Util.warn "Fail fetching document: #{url(docpath)}\n#{e.message}\n#{e.backtrace}"
    end

    # def compare_docids(id1, id2)
    #   Pubid::Iso::Identifier.create(**id1).to_s <=> Pubid::Iso::Identifier.create(**id2).to_s
    # end

@@ -153,17 +177,41 @@
    #
    def save_doc(doc, docpath) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
      docid = doc.docidentifier.detect(&:primary)
      file_name = docid.id.gsub(/[\s\/:]+/, "-").downcase
      file = File.join @output, "#{file_name}.#{@ext}"
-     if @files.include? file
-       Util.warn "Duplicate file #{file} for #{docid.id} from #{Scrapper::DOMAIN}#{docpath}"
+     if File.exist?(file)
+       rewrite_with_same_or_newer doc, docid, file, docpath
      else
-       @files << file
-       index.add_or_update docid.to_h, file
-       File.write file, serialize(doc), encoding: "UTF-8"
+       write_file file, doc, docid
      end
      iso_queue.move_last docpath
+   end
+
+   def rewrite_with_same_or_newer(doc, docid, file, docpath)
+     hash = YAML.load_file file
+     item_hash = HashConverter.hash_to_bib hash
+     bib = ::RelatonIsoBib::IsoBibliographicItem.new(**item_hash)
+     if edition_greater?(doc, bib) || replace_substage98?(doc, bib)
+       write_file file, doc, docid
+     elsif @files.include?(file) && !edition_greater?(bib, doc)
+       Util.warn "Duplicate file `#{file}` for `#{docid.id}` from #{url(docpath)}"
+     end
+   end
+
+   def edition_greater?(doc, bib)
+     doc.edition && bib.edition && doc.edition.content.to_i > bib.edition.content.to_i
+   end
+
+   def replace_substage98?(doc, bib) # rubocop:disable Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
+     doc.edition&.content == bib.edition&.content &&
+       (doc.status&.substage&.value != "98" || bib.status&.substage&.value == "98")
+   end
+
+   def write_file(file, doc, docid)
+     @files << file
+     index.add_or_update docid.to_h, file
+     File.write file, serialize(doc), encoding: "UTF-8"
    end

    #
    # Serialize document to string.
    #
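Note on the new overwrite logic in `save_doc`: 1.19.1 only deduplicated against paths already written in the current run (the `@files` array), so files left on disk by a previous run were overwritten unconditionally. 1.19.2 checks the disk first, loads the stored record, and rewrites it only when the incoming document has a strictly greater edition, or the same edition, unless that would replace a live record with a withdrawn one (incoming substage "98" while the stored record's is not). A toy illustration of the two predicates, using stand-in structs rather than the real `RelatonIsoBib::IsoBibliographicItem` objects (whose substage lives at `status&.substage&.value`):

```ruby
# Simplified stand-ins: `substage` replaces the real status&.substage&.value
# chain so the example runs on its own.
Edition = Struct.new(:content)
Doc = Struct.new(:edition, :substage)

def edition_greater?(doc, bib)
  doc.edition && bib.edition && doc.edition.content.to_i > bib.edition.content.to_i
end

# For equal editions: true unless the incoming doc is withdrawn ("98")
# while the stored one is live, i.e. never replace a live record with a
# withdrawn one.
def replace_substage98?(doc, bib)
  doc.edition&.content == bib.edition&.content &&
    (doc.substage != "98" || bib.substage == "98")
end

stored   = Doc.new(Edition.new("2"), "98") # withdrawn 2nd edition on disk
incoming = Doc.new(Edition.new("2"), "60") # same edition, now published

puts edition_greater?(incoming, stored)    # => false (editions are equal)
puts replace_substage98?(incoming, stored) # => true, so the file is rewritten
```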