# Copyright 2014 Bob Aman # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. require 'mime/types' require 'faraday' require 'nokogiri' require 'sanitize' require 'fastimage' require 'href_preview/fastimage_uri' require 'time' module HRefPreview class Preview ## # Initializes a `Preview` from an HTTP response. def initialize(response, connection=DEFAULT_CONNECTION) @response = response @connection = connection end attr_reader :response ## # Returns the MIME type declared in the HTTP headers or HTML meta # tags. # # @return [MIME::Type] The MIME type of the HTTP response. def mime_type @mime_type ||= (begin MIME::Types[response.headers['Content-Type']].first or begin node = dom.xpath("//*/meta[@http-equiv='Content-Type']/@content").first MIME::Types[node.value].first if node && node.value end or begin node = dom.xpath("//*/meta[@name='dc.format']/@content").first MIME::Types[node.value].first if node && node.value end end) end ## # Returns the charset declared in the HTTP headers or HTML meta # tags. # # @return [String] The charset of the HTTP response. def charset @charset ||= (begin charset = response.headers['Content-Type'].to_s[/;\s*charset=([^;,]*)/, 1] or begin node = dom.xpath("//*/meta[@http-equiv='Content-Type']/@content").first node.value.to_s[/;\s*charset=([^;,]*)/, 1] if node end or begin node = dom.xpath("//*/meta/@charset").first node.value if node end charset.strip if charset end) end ## # @returns [String] The two-letter language code for the content. def language @language ||= (begin language = response.headers['Content-Language'] or begin node = dom.xpath("//*/meta[@http-equiv='Content-Language']/@content").first node.value if node end or begin node = dom.xpath("//*/meta[@name='dc.language']/@content").first node.value if node end if language # Strip the irrelevant '-US' from 'en-US' if it appears. language[/^([a-z]{2})/, 1].to_s.downcase end end) end ## # Returns true if the response had a 2xx HTTP code and the mime type # is either HTML or XHTML. # # @return [true, false] Whether successful HTML response or not. def is_html? return ( response.status >= 200 && response.status < 300 && mime_type && mime_type.sub_type =~ /^x?html/ ) end ## # The DOM for the response body. # # @return [Nokogiri::HTML::Document] # The DOM, as generated by Nokogiri. def dom @dom ||= Nokogiri::HTML(response.body) end ## # @return [String] The title of the page. def title @title ||= (begin if is_html? title = begin node = dom.xpath("//*/meta[@property='og:title']/@content").first node.value if node end or begin node = dom.xpath("//*/meta[@name='dc.title']/@content").first node.value if node end or begin if article_node node = article_node.xpath("*[@itemprop='headline']").first node.text if node end end or begin node = dom.xpath("//*/*[(self::h1 or self::h2) and @itemprop='headline']").first node.text if node end or begin node = dom.xpath("//*/head/title").first node.text if node end or begin # Unlikely to ever happen node = dom.xpath("//*/meta[@name='twitter:title']/@content").first node.value if node end or begin # Unlikely to ever happen node = dom.xpath("//*/meta[@name='sailthru.title']/@content").first node.value if node end if title title.gsub!(/ /, ' ') title.gsub!(/^#{site_name}[\s\|\-\:]*/, '') title.gsub!(/[\s\|\-\:]*#{site_name}$/, '') title.strip end end end) end def description @description ||= (begin if is_html? description = begin node = dom.xpath("//*/meta[@property='og:description']/@content").first node.value if node end or begin node = dom.xpath("//*/meta[@name='dc.description']/@content").first node.value if node end or begin node = dom.xpath("//*/meta[@itemprop='description']/@content").first node.value if node end or begin node = dom.xpath("//*/meta[@name='description']/@content").first node.value if node end or begin node = dom.xpath("//*/meta[@name='dcterms.abstract']/@content").first node.value if node end or begin # Unlikely to ever happen node = dom.xpath("//*/meta[@name='twitter:description']/@content").first node.value if node end or begin # Unlikely to ever happen node = dom.xpath("//*/meta[@name='sailthru.description']/@content").first node.value if node end if description description.gsub!(/ /, ' ') description.strip end end end) end def canonical_uri @canonical_uri ||= (if is_html? begin node = dom.xpath("//*/link[@rel='canonical']/@href").first Addressable::URI.parse(node.value) if node && node.value && node.value != '' end or begin node = dom.xpath("//*/meta[@property='og:url']/@content").first Addressable::URI.parse(node.value) if node && node.value && node.value != '' end or Addressable::URI.parse(response.env.url.to_s) else Addressable::URI.parse(response.env.url.to_s) end) end def shortlink_uri @shortlink_uri ||= (if is_html? begin node = dom.xpath("//*/link[@rel='shortlink']/@href").first Addressable::URI.parse(node.value) if node && node.value && node.value != '' end or begin node = dom.xpath("//*[@class='story-short-url']/a/@href").first Addressable::URI.parse(node.value) if node && node.value && node.value != '' end end) end def image_uri @image_uri ||= (images.first ? Addressable::URI.parse(images.first.uri) : nil) end def images @images ||= (begin image_uris = [] if is_html? nodes = dom.xpath("//*/meta[@property='og:image']/@content") nodes.each do |node| if node && node.value && node.value != '' image_uris << Addressable::URI.parse(node.value) end end if article_node nodes = article_node.xpath("meta[@itemprop='thumbnailurl']/@content") nodes.each do |node| if node && node.value && node.value != '' image_uris << Addressable::URI.parse(node.value) end end end elsif mime_type && mime_type.media_type == 'image' image_uris << canonical_uri end image_uris.uniq.map { |uri| FastImage.new(uri, :timeout => 0.5) } end) end def item_type @item_type ||= (if is_html? begin node = dom.xpath("//*/meta[@property='og:type']/@content").first node.value if node end or if dom.xpath("//*[@itemtype='http://schema.org/NewsArticle']").first != nil 'article' end end) end def site_name @site_name ||= (if is_html? begin node = dom.xpath("//*/meta[@property='og:site_name']/@content").first node.value if node end or begin node = dom.xpath("//*/meta[@name='dc.publisher']/@content").first node.value if node end end) end ## # @return [String] The Twitter handle used by the site. def twitter @twitter ||= (if is_html? begin node = dom.xpath("//*/meta[@name='twitter:site']/@content").first node.value if node && node.value && node.value =~ /^@/ end end) end def article_node @article_node ||= (if is_html? begin nodes = dom.xpath("/html[@itemtype='http://schema.org/NewsArticle']//article[@id='story']") nodes.first if nodes.size == 1 end or begin nodes = dom.xpath("//*/*[@itemtype='http://schema.org/NewsArticle']") nodes.first if nodes.size == 1 end or begin nodes = dom.xpath("//*/*[@itemprop='articleBody']") nodes.first if nodes.size == 1 end or begin nodes = dom.css("article div.article-entry") nodes.first if nodes.size == 1 end or begin nodes = dom.css("article.post div.entry-content") nodes.first if nodes.size == 1 end or begin nodes = dom.css("div.post div.postBody") nodes.first if nodes.size == 1 end or begin nodes = dom.css(".pg_story div#leftcolumn div.body") nodes.first if nodes.size == 1 end end) end options = Sanitize::Config::RELAXED.merge( :remove_contents => true, :elements => %w[ a abbr address b bdi bdo blockquote br caption cite code col colgroup dd del dfn dl dt em figcaption figure h1 h2 h3 h4 h5 h6 hgroup hr i img ins kbd li mark ol p pre q rp rt ruby s samp small span strike strong sub summary sup table tbody td tfoot th thead time tr u ul var wbr ] ) options[:attributes]['span'] = [] SANITIZE_OPTIONS = options def article_html @article_html ||= (if is_html? begin html = nil if article_node html = article_node.children.reject do |child| next unless child.attribute('class') [ 'related_links_inline', 'inline-share-btn-label', 'inline-share-btn' ].include?(child.attribute('class').value) end.map(&:to_s).join('') end if html html = Sanitize.clean(html, SANITIZE_OPTIONS) html.gsub!("\r\n", "\n") html.gsub!("\t", " ") html.gsub!(/ *\n */, "\n") html.gsub!(/\n\n+/, "\n\n") html.gsub!(/
\n+/, "
\n") html.gsub!(/\n+<\/p>/, "\n
") html.gsub!(/<\/p>\n+/, "\n") html.strip! # Excise empty elements reparsed = Nokogiri::HTML.fragment(html) excise_empty = lambda do |node| if node.respond_to?(:name) && node.name == "script" node.unlink else node.children.each do |node| excise_empty.call(node) if node.element? end if node.respond_to?(:attribute_nodes) && node.respond_to?(:text) if node.attribute_nodes.size == 0 && node.text.to_s.strip =~ /^\s*$/ && node.children.all? { |child| child.text? } node.unlink end end end end excise_empty.call(reparsed) html = reparsed.to_s end html end end) end def article_text @article_text ||= is_html? ? Sanitize.clean(article_html) : nil end def published @published ||= (begin # Check under the article node first, otherwise search all begin node = dom.xpath("//*/meta[@property='article:published_time']/@content").first Time.parse(node.value) if node && node.value && node.value != '' end or if article_node node = article_node.xpath("meta[@itemprop='datepublished']/@content").first Time.parse(node.value) if node && node.value && node.value != '' end or begin node = dom.xpath("//*/meta[@itemprop='datepublished']/@content").first Time.parse(node.value) if node && node.value && node.value != '' end or begin node = dom.xpath("//*/meta[@name='dcterms.created']/@content").first Time.parse(node.value) if node && node.value && node.value != '' end or begin # Only a date, not a time, and not particularly specific, # so this is a fallback at best. node = dom.xpath("//*/meta[@name='dc.date']/@content").first Time.parse(node.value) if node && node.value && node.value != '' end end) end def updated @updated ||= (begin # Check under the article node first, otherwise search all begin node = dom.xpath("//*/meta[@property='article:modified_time']/@content").first Time.parse(node.value) if node && node.value && node.value != '' end or if article_node node = article_node.xpath("meta[@itemprop='datemodified']/@content").first Time.parse(node.value) if node && node.value && node.value != '' end or begin node = dom.xpath("meta[@itemprop='datemodified']/@content").first Time.parse(node.value) if node && node.value && node.value != '' end or begin node = dom.xpath("//*/meta[@name='dcterms.modified']/@content").first Time.parse(node.value) if node && node.value && node.value != '' end end) end def inspect addr = '0x' + ('%x' % (object_id << 1)).rjust(14, '0') "#