lib/rfcbib/scrapper.rb in rfcbib-0.3.0 vs lib/rfcbib/scrapper.rb in rfcbib-0.3.1

- old
+ new

@@ -7,21 +7,41 @@
 module RfcBib
   # rubocop:disable Metrics/ModuleLength
 
   # Scrapper module
   module Scrapper
+
+    RFC_URI_PATTERN = "https://www.rfc-editor.org/refs/bibxml/reference.CODE"
+    ID_URI_PATTERN = "https://xml2rfc.tools.ietf.org/public/rfc/bibxml-ids/reference.CODE"
+
     class << self
       # @param text [String]
       # @return [IsoBibItem::BibliographicItem]
       def scrape_page(text)
-        ref = text.sub(' ', '.') + '.xml'
-        if text =~ /^RFC/
-          uri = URI("https://www.rfc-editor.org/refs/bibxml/reference.#{ref}")
-        elsif text =~ /^I-D/
-          uri = URI("https://xml2rfc.tools.ietf.org/public/rfc/bibxml-ids/reference.#{ref}")
+
+        # Remove initial "IETF " string if specified
+        ref = text.
+          gsub(/^IETF /, "").
+          sub(' ', '.') + '.xml'
+
+        uri = case ref
+        when /^RFC/
+          RFC_URI_PATTERN.dup
+        when /^I-D/
+          ID_URI_PATTERN.dup
+        else
+          warn "#{ref}: not recognised for RFC"
+          return
         end
-        doc = Nokogiri::HTML Net::HTTP.get(uri)
+
+        uri = uri.gsub("CODE", ref)
+        res = Net::HTTP.get_response(URI(uri))
+        if res.code != "200"
+          warn "No document found at #{uri}"
+          return
+        end
+        doc = Nokogiri::HTML Net::HTTP.get(URI(uri))
         @reference = doc.at('//reference')
         return unless @reference
         bib_item
       end
 
@@ -156,10 +176,10 @@
       #
       # @return [Array<IsoBibItem::BibliographicDate>] published data.
       #
       def dates
         return unless (date = @reference.at '//front/date')
-        d = [date[:year], month(date[:month]),
+        d = [date[:year], month(date[:month]),
              (date[:day] || "01")].compact.join '-'
         date = Time.parse(d).strftime '%Y-%m-%d'
         [IsoBibItem::BibliographicDate.new(type: 'published', on: date)]
       end