module Indexer

  class Importer

    # Import metadata from an HTML source using microformats.
    #
    # Each loader looks up elements by CSS class (`.iname`, `.iauthor`, ...)
    # and writes what it finds into a plain `data` hash, which `load_html`
    # finally merges into `metadata`.
    #
    # NOTE: The implementation using css selectors is fairly slow.
    # If we ever think it important to speed up then we might
    # try traversing instead.
    #
    module HTMLImportation

      #
      # HTML import procedure.
      #
      # If `source` is an existing file with an `.html` extension it is
      # loaded and `true` is returned. Anything else is passed up the
      # ancestor chain when a `super` implementation exists.
      #
      def import(source)
        if File.file?(source) && File.extname(source) == '.html'
          load_html(source)
          return true
        end
        super(source) if defined?(super)
      end

      #
      # Import metadata from HTML file.
      #
      # `file` may be an already parsed Nokogiri document, an open File,
      # or anything `File.open` accepts as a path.
      #
      def load_html(file)
        # Loaded lazily so nokogiri is only needed when HTML is imported.
        require 'nokogiri'

        doc = case file
              when Nokogiri::XML::Document
                file
              when File
                # The caller owns this handle; it is left open.
                Nokogiri::HTML(file)
              else
                # Block form closes the handle once parsing is done.
                File.open(file) { |io| Nokogiri::HTML(io) }
              end

        data = {}

        %w{version summary description created}.each do |field|
          load_html_simple(field, doc, data)
        end

        load_html_name(doc, data)
        load_html_title(doc, data)
        load_html_authors(doc, data)
        load_html_organizations(doc, data)
        load_html_requirements(doc, data)
        load_html_resources(doc, data)
        load_html_repositories(doc, data)
        load_html_copyrights(doc, data)
        load_html_categories(doc, data)

        metadata.merge!(data)
      end

      #
      # Load a simple field value. Class is `i` + field name, e.g. `iversion`.
      # Only the first matching element is used.
      #
      def load_html_simple(field, doc, data)
        nodes = html_nodes(doc, ".i#{field}")
        return unless nodes

        data[field] = nodes.first.content.strip
      end

      #
      # Load name, and use it for title too if not already set.
      # Class is `iname`.
      #
      def load_html_name(doc, data)
        nodes = html_nodes(doc, '.iname')
        return unless nodes

        text = nodes.first.content.strip
        data['title'] = text.capitalize unless metadata.title
        data['name']  = text
      end

      #
      # Load title, and use it for name too if not already set.
      # Class is `ititle`.
      #
      def load_html_title(doc, data)
        nodes = html_nodes(doc, '.ititle')
        return unless nodes

        text = nodes.first.content.strip
        # A name found earlier in this same import counts as "already set",
        # so an explicit `.iname` is not clobbered by the derived one.
        unless metadata.name || data['name']
          data['name'] = text.downcase.gsub(/\s+/, '_')
        end
        data['title'] = text
      end

      #
      # Load categories, one per element. Class is `icategory`.
      # Elements with blank content are skipped.
      #
      def load_html_categories(doc, data)
        nodes = html_nodes(doc, '.icategory')
        return unless nodes

        data['categories'] ||= []

        nodes.each do |node|
          entry = node.content.strip
          data['categories'] << entry unless entry.empty?
        end
      end

      #
      # Load resources. Class is `iresource`.
      # Elements without an `href` attribute are skipped.
      #
      def load_html_resources(doc, data)
        nodes = html_nodes(doc, '.iresource')
        return unless nodes

        data['resources'] ||= []

        nodes.each do |node|
          entry = html_link_entry(node)
          data['resources'] << entry if entry['uri']
        end
      end

      #
      # Load requirements. Class is `irequirement`, with nested `name`,
      # `version` and `groups` (or `group`) elements. Entries without a
      # name are skipped.
      #
      def load_html_requirements(doc, data)
        nodes = html_nodes(doc, '.irequirement')
        return unless nodes

        data['requirements'] ||= []

        nodes.each do |node|
          entry = {}

          if (n = node.at_css('.name'))
            entry['name'] = n.content.strip
          end

          if (n = node.at_css('.version'))
            entry['version'] = n.content.strip
          end

          if (n = html_first_match(node, %w{.groups .group}))
            # Groups may be written in parentheses, e.g. "(test build)".
            text = n.content.strip
            text = text.sub(/^[(]/, '').sub(/[)]$/, '').strip
            entry['groups'] = text.split(/\s+/)

            if %w{test build document development}.any? { |g| entry['groups'].include?(g) }
              entry['development'] = true
            end
          end

          data['requirements'] << entry if entry['name']
        end
      end

      #
      # Load authors. Class is `iauthor`.
      # The name is taken from a nested `name` or `nickname` element;
      # entries without one are skipped.
      #
      def load_html_authors(doc, data)
        nodes = html_nodes(doc, '.iauthor')
        return unless nodes

        data['authors'] ||= []

        nodes.each do |node|
          entry = html_contact_entry(node, %w{.name .nickname})
          data['authors'] << entry if entry['name']
        end
      end

      #
      # Load organizations. Class is `iorg`.
      # Entries without a nested `name` element are skipped.
      #
      def load_html_organizations(doc, data)
        nodes = html_nodes(doc, '.iorg')
        return unless nodes

        data['organizations'] ||= []

        nodes.each do |node|
          entry = html_contact_entry(node, %w{.name})
          data['organizations'] << entry if entry['name']
        end
      end

      #
      # Load repositories. Class is `irepo`.
      # Elements without an `href` attribute are skipped.
      #
      def load_html_repositories(doc, data)
        nodes = html_nodes(doc, '.irepo')
        return unless nodes

        data['repositories'] ||= []

        nodes.each do |node|
          entry = html_link_entry(node)
          data['repositories'] << entry if entry['uri']
        end
      end

      #
      # Load copyrights. Class is `icopyright`, with nested `holder`,
      # `year` and `license` elements. A trailing word "license" is
      # dropped from the license text.
      #
      def load_html_copyrights(doc, data)
        nodes = html_nodes(doc, '.icopyright')
        return unless nodes

        data['copyrights'] ||= []

        nodes.each do |node|
          entry = {}

          if (n = node.at_css('.holder'))
            entry['holder'] = n.content.strip
          end

          if (n = node.at_css('.year'))
            entry['year'] = n.content.strip
          end

          if (n = node.at_css('.license'))
            text = n.content.strip
            entry['license'] = text.sub(/license$/i, '').strip
          end

          data['copyrights'] << entry
        end
      end

      private

      #
      # Nodes matching `selector`, or nil when there are none.
      #
      def html_nodes(doc, selector)
        nodes = doc.css(selector)
        nodes unless nodes.nil? || nodes.empty?
      end

      #
      # First nested element matching any of `selectors`, tried in order.
      #
      def html_first_match(node, selectors)
        selectors.each do |selector|
          found = node.at_css(selector)
          return found if found
        end
        nil
      end

      #
      # Build a uri/type/label entry from a link-like element.
      #
      def html_link_entry(node)
        {
          'uri'   => node.attr('href'),
          'type'  => node.attr('name') || node.attr('title'), # best choice for this?
          'label' => node.content.strip
        }
      end

      #
      # Build a name/email/website entry from an author or organization
      # element. `name_selectors` lists the nested classes that may hold
      # the name, in order of preference.
      #
      def html_contact_entry(node, name_selectors)
        entry = {}

        if (n = html_first_match(node, name_selectors))
          entry['name'] = n.content.strip
        end

        if (n = node.at_css('.email'))
          # Prefer the link target over the text; drop any mailto: scheme.
          text = n.attr(:href) || n.content.strip
          entry['email'] = text.sub(/^mailto\:/i, '')
        end

        if (n = html_first_match(node, %w{.website .uri .url}))
          entry['website'] = n.attr(:href) || n.content.strip
        end

        entry
      end

    end

    # Include HTMLImportation mixin into Importer class.
    include HTMLImportation

  end

end