# -*- encoding: utf-8 -*-

require 'nokogiri'
require 'hashie/rash'

module MetaInspector
  # Parses the document with Nokogiri
  class Parser
    include MetaInspector::Exceptionable

    def initialize(document, options = {})
      options        = defaults.merge(options)
      @document      = document
      @data          = Hashie::Rash.new
      @exception_log = options[:exception_log]
    end

    extend Forwardable
    def_delegators :@document, :url, :scheme, :host

    # Returns the whole parsed document
    def parsed
      @parsed ||= Nokogiri::HTML(@document.to_s)
    rescue Exception => e
      @exception_log << e
    end

    # Returns all the scraped meta data as a plain hash
    def to_hash
      scrape_meta_data
      @data.to_hash
    end

    # Returns the parsed document title, from the content of the <title> tag.
    # This is not the same as the meta_title tag
    def title
      @title ||= parsed.css('title').inner_text rescue nil
    end

    # A description getter that first checks for a meta description and, if not present,
    # guesses by looking at the first paragraph with more than 120 characters
    def description
      meta_description || secondary_description
    end

    # Links found on the page, as absolute URLs
    def links
      @links ||= parsed_links.map { |l| URL.absolutify(URL.unrelativize(l, scheme), base_url) }.compact.uniq
    end

    # Internal links found on the page, as absolute URLs
    def internal_links
      @internal_links ||= links.select { |link| URL.new(link).host == host }
    end

    # External links found on the page, as absolute URLs
    def external_links
      @external_links ||= links.select { |link| URL.new(link).host != host }
    end

    # Images found on the page, as absolute URLs
    def images
      @images ||= parsed_images.map { |i| URL.absolutify(i, base_url) }
    end

    # Returns the parsed image from Facebook's Open Graph property tags.
    # Most major websites now define this property, and it is usually very relevant.
    # See doc at http://developers.facebook.com/docs/opengraph/
    def image
      meta_og_image || meta_twitter_image
    end

    # Returns the parsed document meta RSS link
    def feed
      @feed ||= (parsed_feed('rss') || parsed_feed('atom'))
    end

    # Returns the charset from the meta tags, looking for it in the following order:
    # <meta charset='utf-8' />
    # <meta http-equiv="Content-Type" content="text/html; charset=windows-1252" />
    def charset
      @charset ||= (charset_from_meta_charset || charset_from_meta_content_type)
    end

    # The dynamic meta tag readers resolved by method_missing are reported as responded to
    def respond_to?(method_name, include_private = false)
      MetaInspector::MetaTagsDynamicMatch.new(method_name).match? || super
    end
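
    # A brief usage sketch of the dynamic meta tag readers resolved by method_missing
    # below; the tag names are illustrative examples, not an exhaustive list:
    #
    #   parser.meta_keywords   # <meta name="keywords" content="...">
    #   parser.meta_robots     # <meta name="robots" content="...">
    #   parser.og_title        # <meta property="og:title" content="...">
    #   parser.twitter_card    # <meta name="twitter:card" content="...">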

    private

    def defaults
      { exception_log: MetaInspector::ExceptionLog.new }
    end

    # Scrapers for all meta_tags in the form of "meta_name" are automatically defined. This has been tested for
    # meta name: keywords, description, robots, generator
    # meta http-equiv: content-language, Content-Type
    #
    # It will first try with meta name="..." and, if nothing is found,
    # with meta http-equiv="...", substituting "_" by "-"
    def method_missing(method_name)
      meta_tags_method = MetaInspector::MetaTagsDynamicMatch.new(method_name)

      if meta_tags_method.match?
        key = meta_tags_method.meta_tag

        # Special treatment for Open Graph (og:) and Twitter Card (twitter:) tags
        if key =~ /^og_(.*)/
          key = og_key(key)
        elsif key =~ /^twitter_(.*)/
          key.gsub!("_", ":")
        end

        scrape_meta_data

        @data.meta.name && (@data.meta.name[key.downcase]) || (@data.meta.property && @data.meta.property[key.downcase])
      else
        super
      end
    end

    # Not all OG keys can be directly translated to meta tag method names by replacing _ with :,
    # as some of them include the _ in the key itself.
    # This is going to be deprecated and replaced soon by a simpler, clearer method, like page.meta['og:site_name']
    def og_key(key)
      case key
      when "og_site_name"
        "og:site_name"
      when "og_image_secure_url"
        "og:image:secure_url"
      when "og_video_secure_url"
        "og:video:secure_url"
      when "og_audio_secure_url"
        "og:audio:secure_url"
      else
        key.gsub("_", ":")
      end
    end

    # Scrapes all meta tags found
    def scrape_meta_data
      unless @data.meta
        @data.meta!.name!
        @data.meta!.property!
        parsed_search("//meta").each do |element|
          get_meta_name_or_property(element)
        end
      end
    end

    # Store meta tag value, looking at meta name or meta property
    def get_meta_name_or_property(element)
      name_or_property = element.attributes["name"]    ? "name"    : (element.attributes["property"] ? "property" : nil)
      content_or_value = element.attributes["content"] ? "content" : (element.attributes["value"]    ? "value"    : nil)

      if !name_or_property.nil? && !content_or_value.nil?
        @data.meta.name[element.attributes[name_or_property].value.downcase] = element.attributes[content_or_value].value
      end
    end

    # Look for the first <p> block with 120 characters or more
    def secondary_description
      first_long_paragraph = parsed_search('//p[string-length() >= 120]').first
      first_long_paragraph ? first_long_paragraph.text : ''
    end

    def parsed_links
      @parsed_links ||= cleanup_nokogiri_values(parsed_search("//a/@href"))
    end

    def parsed_images
      @parsed_images ||= cleanup_nokogiri_values(parsed_search('//img/@src'))
    end

    def parsed_feed(format)
      feed = parsed_search("//link[@type='application/#{format}+xml']").first
      feed ? URL.absolutify(feed.attributes['href'].value, base_url) : nil
    end

    def charset_from_meta_charset
      parsed.css("meta[charset]")[0].attributes['charset'].value rescue nil
    end

    def charset_from_meta_content_type
      parsed.css("meta[http-equiv='Content-Type']")[0].attributes['content'].value.split(";")[1].split("=")[1] rescue nil
    end

    # Returns the base url to absolutify relative links. This can be the one set on a <base> tag,
    # or the url of the document if no <base> tag was found.
    def base_url
      base_href || url
    end

    # Returns the value of the href attribute on the <base /> tag, if it exists
    def base_href
      parsed_search('base').first.attributes['href'].value rescue nil
    end

    # Takes a nokogiri search result, strips the values, rejects the empty ones, and removes duplicates
    def cleanup_nokogiri_values(results)
      results.map { |a| a.value.strip }.reject { |s| s.empty? }.uniq
    end

    # Searches the parsed document for the selector, if the parsed document is searchable
    def parsed_search(selector)
      parsed.respond_to?(:search) ? parsed.search(selector) : []
    end
  end
end
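
# A minimal usage sketch, assuming `document` is an already-fetched MetaInspector
# document (any object responding to #to_s, #url, #scheme and #host, as the
# delegators above require); how that document is built may differ across versions:
#
#   parser = MetaInspector::Parser.new(document)
#
#   parser.title        # content of the <title> tag
#   parser.description  # meta description, or first long paragraph as a fallback
#   parser.links        # all links on the page, as absolute URLs
#   parser.image        # og:image or twitter:image, if present
#   parser.to_hash      # every scraped meta tag, as a plain hash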