module RelatonIso # Fetch all the documents from ISO website. class DataFetcher # # Initialize data fetcher. # # @param [String] output output directory # @param [String] format format of output files (yaml, bibxml, xml) # def initialize(output, format) @output = output @format = format @ext = format.sub(/^bib/, "") @files = [] @queue = ::Queue.new @mutex = Mutex.new end def index @index ||= Relaton::Index.find_or_create :iso, file: HitCollection::INDEXFILE end def iso_queue @iso_queue ||= RelatonIso::Queue.new end # # Initialize data fetcher and fetch data. # # @param [String] output output directory (default: "data") # @param [String] format format of output files. Allowed: yaml (default), bibxml, xml # # @return [void] # def self.fetch(output: "data", format: "yaml") t1 = Time.now puts "Started at: #{t1}" FileUtils.mkdir_p output new(output, format).fetch t2 = Time.now puts "Stopped at: #{t2}" puts "Done in: #{(t2 - t1).round} sec." end # # Go through all ICS and fetch all documents. # # @return [void] # def fetch # rubocop:disable Metrics/AbcSize puts "Scrapping ICS pages..." fetch_ics puts "[#{Time.now}] Scrapping documents..." fetch_docs iso_queue.save # index.sort! { |a, b| compare_docids a, b } index.save end # # Fetch ICS page recursively and store all the links to documents in the iso_queue. # # @param [String] path path to ICS page # def fetch_ics threads = Array.new(3) { thread { |path| fetch_ics_page(path) } } fetch_ics_page "/standards-catalogue/browse-by-ics.html" sleep(1) until @queue.empty? threads.size.times { @queue << :END } threads.each(&:join) end def fetch_ics_page(path) resp = get_redirection path page = Nokogiri::HTML(resp.body) page.xpath("//td[@data-title='Standard and/or project']/div/div/a").each do |item| iso_queue.add_first item[:href].split("?").first end page.xpath("//td[@data-title='ICS']/a").each do |item| @queue << item[:href] end end # # Get the page from the given path. If the page is redirected, get the # page from the new path. # # @param [String] path path to the page # # @return [Net::HTTPOK] HTTP response # def get_redirection(path) # rubocop:disable Metrics/MethodLength try = 0 uri = URI(Scrapper::DOMAIN + path) begin get_response uri rescue Net::OpenTimeout, Net::ReadTimeout, Errno::ECONNREFUSED => e try += 1 retry if check_try try, uri Util.error "Error fetching #{uri}, #{e.message}" end end def get_response(uri) resp = Net::HTTP.get_response(uri) resp.code == "302" ? get_redirection(resp["location"]) : resp end def check_try(try, uri) if try < 3 Util.warn "Timeout fetching #{uri}, retrying..." sleep 1 true end end def fetch_docs threads = Array.new(3) { thread { |path| fetch_doc(path) } } iso_queue[0..10_000].each { |docpath| @queue << docpath } threads.size.times { @queue << :END } threads.each(&:join) end # # Fetch document from ISO website. # # @param [String] docpath document page path # # @return [void] # def fetch_doc(docpath) # path = docpath.sub(/\.html$/, "") # hit = Hit.new({ path: docpath }, nil) doc = Scrapper.parse_page docpath @mutex.synchronize { save_doc doc, docpath } rescue StandardError => e Util.error "Error fetching document: #{Scrapper::DOMAIN}#{docpath}\n" \ "#{e.message}\n#{e.backtrace}" end # def compare_docids(id1, id2) # Pubid::Iso::Identifier.create(**id1).to_s <=> Pubid::Iso::Identifier.create(**id2).to_s # end # # save document to file. # # @param [RelatonIsoBib::IsoBibliographicItem] doc document # # @return [void] # def save_doc(doc, docpath) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength docid = doc.docidentifier.detect(&:primary) file_name = docid.id.gsub(/[\s\/:]+/, "-").downcase file = File.join @output, "#{file_name}.#{@ext}" if @files.include? file Util.warn "Duplicate file #{file} for #{docid.id} from #{Scrapper::DOMAIN}#{docpath}" else @files << file index.add_or_update docid.to_h, file File.write file, serialize(doc), encoding: "UTF-8" end iso_queue.move_last docpath end # # Serialize document to string. # # @param [RelatonIsoBib::IsoBibliographicItem] doc document # # @return [String] serialized document # def serialize(doc) case @format when "yaml" then doc.to_hash.to_yaml when "bibxml" then doc.to_bibxml when "xml" then doc.to_xml bibdata: true end end private # # Create thread worker # # @return [Thread] thread # def thread Thread.new do while (path = @queue.pop) != :END yield path end end end end end