# require "rubygems" # require "rubygems/package" # require "zlib" require "relaton_ietf/rfc_index_entry" require "relaton_ietf/rfc_entry" module RelatonIetf class DataFetcher # # Data fetcher initializer # # @param [String] source source name # @param [String] output directory to save files # @param [String] format format of output files (xml, yaml, bibxml); # for ietf-rfcsubseries source only: xml # def initialize(source, output, format) @source = source @output = output @format = format @ext = @format.sub(/^bib|^rfc/, "") @files = [] end # # Initialize fetcher and run fetch # # @param [String] source source name # @param [Strin] output directory to save files, default: "data" # @param [Strin] format format of output files (xml, yaml, bibxml); # default: yaml; for ietf-rfcsubseries source only: xml # def self.fetch(source, output: "data", format: "yaml") t1 = Time.now puts "Started at: #{t1}" FileUtils.mkdir_p output # unless Dir.exist? output new(source, output, format).fetch t2 = Time.now puts "Stopped at: #{t2}" puts "Done in: #{(t2 - t1).round} sec." end # # Fetch documents # def fetch case @source when "ietf-rfcsubseries" then fetch_ieft_rfcsubseries when "ietf-internet-drafts" then fetch_ieft_internet_drafts when "ietf-rfc-entries" then fetch_ieft_rfcs end end # # Fetches ietf-rfcsubseries documents # def fetch_ieft_rfcsubseries rfc_index.xpath("xmlns:bcp-entry|xmlns:fyi-entry|xmlns:std-entry").each do |doc| save_doc RfcIndexEntry.parse(doc) end end # # Fetches ietf-internet-drafts documents # def fetch_ieft_internet_drafts # rubocop:disable Metrics/MethodLength, Metrics/AbcSize versions = Dir["bibxml-ids/*.xml"].each_with_object([]) do |path, vers| file = File.basename path, ".xml" if file.include?("D.draft-") vers << file.sub(/^reference\.I-D\./, "") /(?\d+)$/ =~ file end bib = BibXMLParser.parse(File.read(path, encoding: "UTF-8")) if ver version = RelatonBib::BibliographicItem::Version.new nil, ver bib.instance_variable_set :@version, [version] end save_doc bib end update_versions(versions) if versions.any? && @format != "bibxml" end # # Updates I-D's versions # # @param [Array] versions list of versions # def update_versions(versions) # rubocop:disable Metrics/MethodLength, Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity series = "" bib_versions = [] Dir["#{@output}/*.#{@ext}"].each do |file| match = /(?draft-.+)-(?\d{2})\.#{@ext}$/.match file if match if series != match[:series] bib_versions = versions.select { |ref| ref.include? match[:series] } create_series match[:series], bib_versions series = match[:series] end lv = bib_versions.select { |ref| ref.match(/\d+$/).to_s.to_i < match[:ver].to_i } hv = bib_versions.select { |ref| ref.match(/\d+$/).to_s.to_i > match[:ver].to_i } if lv.any? || hv.any? bib = read_doc(file) bib.relation << version_relation(lv.last, "updates") if lv.any? bib.relation << version_relation(hv.first, "updatedBy") if hv.any? save_doc bib, check_duplicate: false end end end end # # Create unversioned bibliographic item # # @param [String] ref reference # @param [Array] versions list of versions # def create_series(ref, versions) # rubocop:disable Metrics/AbcSize return if versions.size < 2 vs = versions.sort_by { |v| v.match(/\d+$/).to_s.to_i } fref = RelatonBib::FormattedRef.new content: ref docid = RelatonBib::DocumentIdentifier.new type: "Internet-Draft", id: ref, primary: true rel = vs.map { |v| version_relation v, "includes" } last_v = HashConverter.hash_to_bib YAML.load_file("#{@output}/#{vs.last}.#{@ext}") bib = IetfBibliographicItem.new( title: last_v[:title], abstract: last_v[:abstract], formattedref: fref, docid: [docid], relation: rel, ) save_doc bib end # # Create bibitem relation # # @param [String] ref reference # @param [String] type relation type # # @return [RelatonBib::DocumentRelation] relation # def version_relation(ref, type) fref = RelatonBib::FormattedRef.new content: ref docid = RelatonBib::DocumentIdentifier.new type: "Internet-Draft", id: ref, primary: true bibitem = IetfBibliographicItem.new formattedref: fref, docid: [docid] RelatonBib::DocumentRelation.new(type: type, bibitem: bibitem) end # # Redad saved documents # # @param [String] file path to file # # @return [RelatonIetf::IetfBibliographicItem] bibliographic item # def read_doc(file) doc = File.read(file, encoding: "UTF-8") case @format when "xml" then XMLParser.from_xml(doc) when "yaml" then IetfBibliographicItem.from_hash YAML.safe_load(doc) else BibXMLParser.parse(doc) end end # # Fetches ietf-rfc-entries documents # def fetch_ieft_rfcs rfc_index.xpath("xmlns:rfc-entry").each do |doc| save_doc RfcEntry.parse(doc) rescue StandardError => e warn "Error parsing #{doc.at('./xmlns:doc-id').text}: #{e.message}" warn e.backtrace[0..5].join("\n") end end # # Get RFC index # # @return [Nokogiri::XML::Document] RFC index # def rfc_index uri = URI "https://www.rfc-editor.org/rfc-index.xml" Nokogiri::XML(Net::HTTP.get(uri)).at("/xmlns:rfc-index") end # # Save document to file # # @param [RelatonIetf::RfcIndexEntry, nil] rfc index entry # @param [Boolean] check_duplicate check for duplicate # def save_doc(entry, check_duplicate: true) # rubocop:disable Metrics/MethodLength, Metrics/CyclomaticComplexity return unless entry c = case @format when "xml" then entry.to_xml(bibdata: true) when "yaml" then entry.to_hash.to_yaml else entry.send("to_#{@format}") end file = file_name entry if check_duplicate && @files.include?(file) warn "File #{file} already exists. Document: #{entry.docnumber}" elsif check_duplicate @files << file end File.write file, c, encoding: "UTF-8" end # # Generate file name # # @param [RelatonIetf::RfcIndexEntry] entry # # @return [String] file name # def file_name(entry) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity id = if entry.respond_to? :docidentifier entry.docidentifier.detect { |i| i.type == "Internet-Draft" }&.id end id ||= entry.docnumber || entry.formattedref.content if @source == "ietf-internet-drafts" then id.downcase! else id.upcase! end name = id.gsub(/[\s,:\/]/, "_").squeeze("_") File.join @output, "#{name}.#{@ext}" end end end