require 'redcarpet' require 'nokogiri' require 'sanitize' # This utility class is used to # work on html text # # You can initialize it with html or markdown text class HtmlProcessor attr_reader :html, :original #====================================== # Constants #====================================== DESCRIPTION_PROCESSING_ORDER = %w( p h1 h2 h3 h4 h5 h6 ) # Define Youtube transformer for Sanitize YOUTUBE_TRANSFORMER = lambda do |env| node = env[:node] node_name = env[:node_name] # Don't continue if this node is already whitelisted or is not an element. return if env[:is_whitelisted] || !node.element? # Don't continue unless the node is an iframe. return unless node_name == 'iframe' # Verify that the video URL is actually a valid YouTube video URL. return unless node['src'] =~ %r|\A(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/| # We're now certain that this is a YouTube embed, but we still need to run # it through a special Sanitize step to ensure that no unwanted elements or # attributes that don't belong in a YouTube embed can sneak in. Sanitize.node!(node, { :elements => %w[iframe], :attributes => { 'iframe' => %w[allowfullscreen frameborder height src width] } }) # Now that we're sure that this is a valid YouTube embed and that there are # no unwanted elements or attributes hidden inside it, we can tell Sanitize # to whitelist the current node. {:node_whitelist => [node]} end # Default options for Sanitize SANITIZER_OPTS = Sanitize::Config::RELAXED.merge( attributes: Sanitize::Config::RELAXED[:attributes].merge( 'a' => %w[href hreflang name rel target], ), transformers: YOUTUBE_TRANSFORMER ) #====================================== # Methods #====================================== def initialize(text, options = { }) @original = text # Process markdown or leave original if options[:format].to_s == 'markdown' && text html_options = { :safe_links_only => true, :hard_wrap => true, :filter_html => false } renderer_options = { :autolink => true, :no_intraemphasis => true, :fenced_code_blocks => true, :superscript => true } renderer = Redcarpet::Markdown.new(Redcarpet::Render::HTML.new(html_options), renderer_options) raw_html = renderer.render(text) @html = Sanitize.fragment(raw_html, SANITIZER_OPTS) else @html = text end end # Return a Nokogiri document based # on processor html def document @document ||= Nokogiri::HTML(@html) end # Return a description of the document # by returning the first sentence of the # first DESCRIPTION_PROCESSING_ORDER found def description # Return cached value if one return @description if @description # Parse the html document to try to find # a description @description = '' DESCRIPTION_PROCESSING_ORDER.each do |selector| elem = self.document.css(selector).select { |e| e && !e.content.blank? }.first next if elem.blank? #skip if nil or empty # Try to get the first two sentences match = elem.content.match(/([^.!?]+[.!?]?)([^.!?]+[.!?]?)?/) if match && match.captures.any? @description = match.captures.compact.join('') end break if !@description.empty? end return @description end end