lib/rfcbib/scrapper.rb in rfcbib-0.3.0 vs lib/rfcbib/scrapper.rb in rfcbib-0.3.1
- old
+ new
@@ -7,21 +7,41 @@
module RfcBib
# rubocop:disable Metrics/ModuleLength
# Scrapper module
module Scrapper
+
# URL templates for the reference XML files. scrape_page replaces the literal
# "CODE" placeholder with the file name it builds (e.g. "RFC.8341.xml").
+ RFC_URI_PATTERN = "https://www.rfc-editor.org/refs/bibxml/reference.CODE"
+ ID_URI_PATTERN = "https://xml2rfc.tools.ietf.org/public/rfc/bibxml-ids/reference.CODE"
+
class << self
# Downloads the reference XML for an RFC or Internet-Draft and turns it into
# a bibliographic item.
#
# In this diff, lines prefixed with "-" are the rfcbib 0.3.0 implementation,
# lines prefixed with "+" are the 0.3.1 replacement, and unprefixed lines are
# common to both versions.
#
# @param text [String] reference code; the 0.3.1 code accepts values that
#   start with "RFC" or "I-D", optionally preceded by "IETF "
# @return [IsoBibItem::BibliographicItem, nil] nil when the code is not
#   recognised, the server does not answer with status 200, or the fetched
#   document has no reference element
def scrape_page(text)
- ref = text.sub(' ', '.') + '.xml'
- if text =~ /^RFC/
- uri = URI("https://www.rfc-editor.org/refs/bibxml/reference.#{ref}")
- elsif text =~ /^I-D/
- uri = URI("https://xml2rfc.tools.ietf.org/public/rfc/bibxml-ids/reference.#{ref}")
+
# Build the remote file name: only the first space is replaced, so
# "RFC 8341" becomes "RFC.8341.xml".
+ # Remove initial "IETF " string if specified
+ ref = text.
+ gsub(/^IETF /, "").
+ sub(' ', '.') + '.xml'
+
# Choose the URL template from the file-name prefix; any other prefix is
# reported with warn and the method returns nil.
+ uri = case ref
+ when /^RFC/
+ RFC_URI_PATTERN.dup
+ when /^I-D/
+ ID_URI_PATTERN.dup
+ else
+ warn "#{ref}: not recognised for RFC"
+ return
end
- doc = Nokogiri::HTML Net::HTTP.get(uri)
+
# Fill the "CODE" placeholder of the template with the file name built above.
+ uri = uri.gsub("CODE", ref)
# NOTE(review): this response is used only for its status code; the body is
# requested again by Net::HTTP.get below, so a successful lookup issues two
# HTTP requests for the same URI.
+ res = Net::HTTP.get_response(URI(uri))
+ if res.code != "200"
+ warn "No document found at #{uri}"
+ return
+ end
+ doc = Nokogiri::HTML Net::HTTP.get(URI(uri))
# Keep the first reference element in module-level state; bib_item is defined
# outside this excerpt and presumably builds the result from @reference
# (TODO confirm).
@reference = doc.at('//reference')
return unless @reference
bib_item
end
@@ -156,10 +176,10 @@
#
# @return [Array<IsoBibItem::BibliographicDate>] published data.
#
def dates
# Without a front/date element there is no publication date; returns nil.
return unless (date = @reference.at '//front/date')
# Join year, month and day with "-": a missing day defaults to "01" and nil
# parts are dropped by compact. month is a helper defined outside this
# excerpt (presumably it converts the month attribute; TODO confirm).
# NOTE(review): the removed ("-") and added ("+") lines below read
# identically in this rendering; presumably a whitespace-only change.
- d = [date[:year], month(date[:month]),
+ d = [date[:year], month(date[:month]),
(date[:day] || "01")].compact.join '-'
# Normalise the joined string to YYYY-MM-DD.
date = Time.parse(d).strftime '%Y-%m-%d'
# Always a single-element array holding the 'published' date.
[IsoBibItem::BibliographicDate.new(type: 'published', on: date)]
end