# -*- ruby encoding: utf-8 -*-

$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)

require 'open-uri'
require 'nokogiri'
require 'cgi'
require 'fileutils'
require 'yaml'

# Must be set before mime/types is required.
ENV['RUBY_MIME_TYPES_LAZY_LOAD'] = 'yes'
require 'mime/types'

# Walks the IANA media-types index page and hands every linked registry page
# to IANADownloader::Parser, which writes one YAML file per page into the
# destination directory.
class IANADownloader
  INDEX_URL = %q(https://www.iana.org/assignments/media-types/)
  MIME_HREF = %r{/assignments/media-types/(.+)/?$}

  # Convenience entry point: build a downloader for +destination+ and fetch
  # everything linked from the index.
  def self.download_to(destination)
    new(destination).download_all
  end

  attr_reader :destination

  # destination:: directory that receives the YAML files. When nil, it
  #               defaults to "../../type-lists" relative to this file.
  def initialize(destination = nil)
    @destination =
      File.expand_path(destination ||
                       File.expand_path('../../type-lists', __FILE__))
  end

  # Fetches the index page and downloads every link whose href matches
  # MIME_HREF, except the link whose text is "example".
  def download_all
    puts "Downloading index of MIME types from #{INDEX_URL}."
    # URI#read comes from open-uri. Kernel#open no longer accepts URLs on
    # Ruby 3.0+, so the URL is opened explicitly through URI.
    index = Nokogiri::HTML(URI.parse(INDEX_URL).read)

    index.xpath('//a').each do |tag|
      next unless tag['href']

      href_match = MIME_HREF.match(tag['href'])
      next unless href_match

      href = href_match.captures.first
      next if tag.content == 'example'

      download_one(href, tag.content, href)
    end
  end

  # Downloads and parses a single registry page.
  #
  # url::  absolute http(s) URL, or a path relative to INDEX_URL.
  # name:: base name of the YAML file to write (defaults to +url+; for an
  #        absolute URL with no explicit name, the URL's basename is used).
  # type:: media type handed to the parser; the parser falls back to +name+
  #        when nil.
  def download_one(url, name = url, type = nil)
    if url =~ %r{^https?://}
      name = File.basename(url) if name == url
    else
      url = File.join(INDEX_URL, url)
    end

    Parser.download(name, from: url, to: @destination, type: type)
  end
end

# Downloads one IANA registry page, merges the rows of its table into a
# MIME::Types collection (seeded from the existing YAML file when present) and
# writes the result back as YAML.
class IANADownloader::Parser
  CONTACT_PEOPLE = %r{https?://www.iana.org/assignments/contact-people.html?l?#(.*)}
  RFC_EDITOR     = %r{https?://www.rfc-editor.org/rfc/rfc(\d+).txt}
  RFC_BAD_EDITOR = %r{https?://www.rfc-editor.org/rfc/rfcxxxx.txt}
  IETF_RFC       = %r{https?://www.ietf.org/rfc/rfc(\d+).txt}
  IETF_RFC_TOOLS = %r{https?://tools.ietf.org/html/rfc(\d+)}
  # Matches a trailing "(obsolete)" / "(deprecated)" marker. The parentheses
  # are escaped literals; the previous pattern used "$" anchors in their place
  # and therefore could never match.
  OBSOLETE       = %r{(.+)\s+\((?:obsolete|deprecated)\)}i
  DEPRECATED     = %r{(.+)\s+-\s+DEPRECATED\s+-\s+Please\s+use\s+(.+)}

  # Builds a parser for +name+ and runs download -> parse -> save on it.
  # See #initialize for the accepted options.
  def self.download(name, options = {})
    new(name, options) do |parser|
      parser.parse(parser.download)
      parser.save
    end
  end

  # name::    base name of the output file (a ".yml" suffix is normalised).
  # options:: :from (required) URL to fetch; :to (required) output directory;
  #           :type (optional) media type, defaulting to +name+.
  #
  # Raises ArgumentError when :from or :to is missing. Yields self when a
  # block is given.
  def initialize(name, options = {})
    raise ArgumentError, ":from not specified" unless options[:from]
    raise ArgumentError, ":to not specified" unless options[:to]

    @name  = "#{File.basename(name, '.yml')}.yml"
    @from  = options[:from]
    @to    = File.expand_path(options[:to])
    @type  = File.basename(options[:type] || name, '.yml')
    @file  = File.join(@to, @name)
    @types = load_mime_types || MIME::Types.new

    yield self if block_given?
  end

  # Fetches the page at :from and returns it as a parsed Nokogiri document.
  def download
    puts "Downloading #{@name} from #{@from}"
    # See IANADownloader#download_all for why URI#read is used, not Kernel#open.
    Nokogiri::HTML(URI.parse(@from).read)
  end

  # Reads every row of the nested table in +html+ and creates or updates the
  # matching MIME::Type in the in-memory collection.
  #
  # Raises RuntimeError when the page has no table rows, or when rows have a
  # column count other than 3 or 4.
  def parse(html)
    nodes = html.xpath('//table//table//tr')
    raise "no table rows found in #{@from}" if nodes.empty?

    header = nodes.first

    # How many children does the first node have?
    node_count = child_elems(header).size

    if node_count == 1
      # The title node doesn't have what we expect. Let's try it based on
      # the first real node. next_element (rather than next) skips any
      # whitespace text node sitting between the two rows.
      first_row = header.next_element
      node_count = child_elems(first_row).size if first_row
    end

    nodes.each do |node|
      next if node == header

      elems = child_elems(node)
      next if elems.size.zero?

      if elems.size != node_count
        warn "size mismatch (#{elems.size} != #{node_count}) in node: #{node}"
        next
      end

      sub_ix, ref_ix =
        case elems.size
        when 3
          [1, 2]
        when 4
          [1, 3]
        else
          message = "size error (#{elems.size} != {3,4}) in node: #{node}"
          warn message
          raise message
        end

      subtype = elems[sub_ix].content.chomp.strip
      refs = child_elems(elems[ref_ix]).flat_map { |ref|
        anchors = ref.name == 'a' ? ref : ref.xpath('a')
        [anchors].flatten.map { |a| href_to_ref(a) }
      }

      content_type = [@type, subtype].join('/')
      use_instead  = nil
      obsolete     = false

      if (match = OBSOLETE.match(content_type))
        content_type = match[1]
        obsolete     = true
      elsif (match = DEPRECATED.match(content_type))
        content_type = match[1]
        use_instead  = [match[2]]
        obsolete     = true
      end

      known = @types.select { |t| t.content_type == content_type }

      if known.empty?
        mt = MIME::Type.new(content_type)
        mark_registered(mt, refs, obsolete, use_instead)
        @types << mt
      else
        known.each { |t| mark_registered(t, refs, obsolete, use_instead) }
      end
    end
  end

  # Writes the collected types, sorted, as YAML to "<to>/<name>.yml",
  # creating the directory first when needed.
  def save
    FileUtils.mkdir_p(@to)
    File.open(@file, 'wb') { |f| f.puts @types.to_a.sort.to_yaml }
  end

  private

  # Element children of +node+ (text and comment nodes are dropped).
  def child_elems(node)
    node.children.select(&:elem?)
  end

  # Loads the previously saved YAML file; returns nil when it does not exist.
  def load_mime_types
    MIME::Types::Loader.load_from_yaml(@file) if File.exist?(@file)
  end

  # Stamps +mime_type+ as IANA-registered with the given references, and sets
  # the obsolete / use_instead fields only when they carry a value.
  def mark_registered(mime_type, refs, obsolete, use_instead)
    mime_type.references  = %w(IANA) + refs
    mime_type.registered  = true
    mime_type.obsolete    = obsolete if obsolete
    mime_type.use_instead = use_instead if use_instead
  end

  # Converts a reference link into its short textual form:
  # "[Name]" / "[Name=tag]" for contact-people anchors, "RFCnnnn" for RFC
  # links, the link text for the placeholder "rfcxxxx" link, and
  # "{text=url}" for any other http(s) URL. Anything else is returned
  # unchanged.
  def href_to_ref(ref)
    case ref['href']
    when CONTACT_PEOPLE
      tag = CGI.unescape(Regexp.last_match(1)).chomp.strip
      if tag == ref.content
        "[#{ref.content}]"
      else
        "[#{ref.content}=#{tag}]"
      end
    when RFC_EDITOR, IETF_RFC, IETF_RFC_TOOLS
      "RFC#{Regexp.last_match(1)}"
    when RFC_BAD_EDITOR
      ref.content
    when %r{(https?://.*)}
      "{#{ref.content}=#{Regexp.last_match(1)}}"
    else
      ref
    end
  end
end