require 'cgi'
require 'maruku'
require 'redcarpet'
require 'nokogiri'
require 'uri'
require 'sanitize'

# Converts Markdown into (optionally sanitized) HTML using one of the
# flavours listed in AVAILABLE_FLAVOURS, applying a few post-processing steps
# to the parsed document: code-block language classes, sub-header anchor ids,
# callout/warning/note blocks and relative link rewriting.
module DevcenterParser

  VERSION = '1.3.7'

  AVAILABLE_FLAVOURS = [:github, :maruku]

  # Errors inherit from StandardError (not Exception) so that callers using a
  # plain `rescue` / `rescue => e` can handle them; `rescue Exception` and
  # rescuing the specific classes keep working as before.
  class InvalidMarkdownError < StandardError; end
  class InvalidRawHTMLError < StandardError; end
  class UnknownFlavourError < StandardError; end

  # Redcarpet HTML renderer with SmartyPants mixed in.
  class HTMLWithPantsRenderer < Redcarpet::Render::HTML
    include Redcarpet::Render::SmartyPants
  end

  # Renders +markdown+ with the given +flavour+ and returns the HTML after
  # passing it through Sanitize with the configuration from .sanitize_config.
  #
  # Raises InvalidMarkdownError or UnknownFlavourError (see
  # .to_unsanitized_html).
  def self.to_html(markdown, flavour)
    html = to_unsanitized_html(markdown, flavour.to_sym)
    sanitize(html)
  end

  # Renders +markdown+ (anything responding to #to_s) with the given +flavour+
  # (:maruku or :github, or anything whose #to_sym yields one of them) and
  # returns the resulting HTML string without sanitizing it.
  #
  # Raises UnknownFlavourError for an unsupported flavour and
  # InvalidMarkdownError for any other StandardError raised while rendering,
  # including broken raw HTML detected in the output.
  def self.to_unsanitized_html(markdown, flavour)
    markdown = normalize_newlines(markdown.to_s)
    markdown = separate_consecutive_blockquote_blocks(markdown)

    doc = case flavour.to_sym
          when :maruku
            html = Maruku.new(markdown, :on_error => :raise).to_html
            fragment = Nokogiri::HTML::DocumentFragment.parse(html)
            maruku_code_blocks(fragment)
            maruku_underscores_to_dashes_in_subheader_anchors(fragment)
          when :github
            html = github_parser.render(markdown)
            fragment = Nokogiri::HTML::DocumentFragment.parse(html)
            github_parse_special_blocks(fragment)
            github_underscores_to_dashes_in_subheader_anchors(fragment)
          else
            raise UnknownFlavourError, "Markdown flavour '#{flavour}' not supported"
          end

    convert_to_article_links_all_relative_links_with_missing_initial_slashes(doc)
    html = doc.to_html(:encoding => 'utf-8')
    verify_raw_html(html)
    html
  rescue UnknownFlavourError
    # Must come before the generic clause below: an unsupported flavour is
    # reported as-is rather than being wrapped in InvalidMarkdownError.
    raise
  rescue InvalidRawHTMLError => e
    raise InvalidMarkdownError, e.message
  rescue => e
    raise InvalidMarkdownError, parse_maruku_error(e.message)
  end

  # Cleans +html+ with Sanitize using the configuration from .sanitize_config.
  def self.sanitize(html)
    Sanitize.clean(html, sanitize_config)
  end

  # NOTE: a bare `private` only affects instance methods defined after it, so
  # the `def self.` methods below remain publicly callable. It is kept as a
  # marker for "internal helpers"; visibility is deliberately left unchanged.
  private

  # The current parsers consider something like:
  #   > foo
  #
  #   > bar
  # as a single blockquote, while we want it to be two different ones.
  # This method inserts a separator, surrounded by blank lines, between
  # consecutive blockquote blocks so parsers process them separately.
  #
  # NOTE(review): the separator is currently an empty string, so only blank
  # lines are inserted. The original description mentions an "empty
  # paragraph" -- confirm whether separator markup is missing here.
  def self.separate_consecutive_blockquote_blocks(markdown)
    separator = ''
    markdown.gsub(/^>(.*)?\s\s>/, '>\1' + "\n\n#{separator}\n\n>")
  end

  # Returns +markdown+ with every line right-stripped and joined with "\n",
  # which drops "\r\n" line endings and trailing whitespace.
  #
  # Only the right side is stripped: leading whitespace is significant in
  # Markdown (indented code blocks, nested lists) and must reach the parser.
  def self.normalize_newlines(markdown)
    markdown.lines.map { |line| line.rstrip }.join("\n")
  end

  # Memoized Redcarpet parser with fenced code blocks and tables enabled.
  def self.github_parser
    @@github_parser ||= Redcarpet::Markdown.new(HTMLWithPantsRenderer, fenced_code_blocks: true, tables: true)
  end

  # Memoized Sanitize configuration: Sanitize::Config::RELAXED extended with
  # extra attributes/elements (including embedded video tags), forcing
  # allowscriptaccess="never" on object/embed/param.
  #
  # The configuration is built from new hashes and arrays so that the shared
  # Sanitize::Config::RELAXED constant is never modified.
  def self.sanitize_config
    return @@sanitize_config if defined?(@@sanitize_config)

    relaxed = Sanitize::Config::RELAXED
    relaxed_attributes = relaxed[:attributes]

    all_attributes = relaxed_attributes[:all] +
      %w{ id class style name width height border align } +
      # embedded videos
      %w{ value src type allowscriptaccess allowfullscreen }

    attributes = relaxed_attributes.merge(
      :all => all_attributes,
      'a'  => relaxed_attributes['a'] + %w{ target }
    )

    elements = relaxed[:elements] +
      %w{ div span hr tt } +
      # embedded videos
      %w{ object param embed }

    @@sanitize_config = relaxed.merge(
      :elements        => elements,
      :attributes      => attributes,
      :add_attributes  => {
        'object' => {'allowscriptaccess' => 'never'},
        'embed'  => {'allowscriptaccess' => 'never'},
        'param'  => {'allowscriptaccess' => 'never'}
      },
      :remove_contents => true,
      :allow_comments  => true
    )
  end

  # For every <pre><code> node whose content starts with a ":::lang" marker,
  # removes the marker line and sets the node's class attribute to the
  # language name. Returns +doc+.
  def self.maruku_code_blocks(doc)
    doc.css('pre>code').each do |node|
      match = node.content.match(/\A\s*:::\s*(\w+)/)
      next unless match

      node.content = node.content.gsub(/\A\s*:::\s*\w+\n/, '')
      node['class'] = match[1]
    end
    doc
  end

  # Sets dash-separated ids on h2..h6 headers of a Maruku document.
  # Returns +doc+.
  def self.maruku_underscores_to_dashes_in_subheader_anchors(doc)
    assign_subheader_anchor_ids(doc)
  end

  # Sets dash-separated ids on h2..h6 headers of a Redcarpet document.
  # Returns +doc+.
  def self.github_underscores_to_dashes_in_subheader_anchors(doc)
    assign_subheader_anchor_ids(doc)
  end

  # Shared implementation for both flavours: gives each h2..h6 node an id
  # derived from its text content via .subheader_id. Returns +doc+.
  def self.assign_subheader_anchor_ids(doc)
    doc.css("h2,h3,h4,h5,h6").each do |node|
      node['id'] = subheader_id(node.content)
    end
    doc
  end

  # Builds an anchor id from header text: lowercased, each run of non-word
  # characters replaced by a single dash, leading/trailing dashes removed.
  def self.subheader_id(content)
    content.to_s.downcase.gsub(/\W+/, '-').gsub(/\A-+|-+\Z/, '')
  end

  # Turns blockquotes whose first paragraph starts with "callout", "warning"
  # or "note" into <div class="..."> elements and removes that keyword from
  # the paragraph.
  def self.github_parse_special_blocks(doc)
    doc.css('blockquote>p:first').each do |node|
      match = node.inner_html.match(/\A\W*(callout|warning|note)\W/)
      next unless match

      node.parent.name = 'div'
      node.parent['class'] = match[1]
      new_html = node.inner_html.gsub(/\A\W*(callout|warning|note)\W/, '')
      # Assigning inner_html directly causes encoding issues in old libxml versions,
      # workaround from https://github.com/sparklemotion/nokogiri/issues/458#issuecomment-3136620
      node.children = Nokogiri::HTML.fragment(new_html, 'utf-8')
    end
  end

  # Prefixes with "/articles/" the href of every link that has one and does
  # not start with "http", "/", "mailto:" or "#".
  def self.convert_to_article_links_all_relative_links_with_missing_initial_slashes(doc)
    doc.css('a').each do |node|
      href = node['href']
      next if href.nil? || href =~ /\Ahttp|\A\/|\Amailto\:|\A#/

      node['href'] = "/articles/#{href}"
    end
  end

  # Raises InvalidRawHTMLError when +html+ contains the parser's error marker.
  def self.verify_raw_html(html)
    raise(InvalidRawHTMLError, parse_raw_html_error(html)) if invalid_raw_html?(html)
  end

  # True when +html+ contains the 'markdown-html-error' marker.
  def self.invalid_raw_html?(html)
    html.to_s.include?('markdown-html-error')
  end

  # Condenses a multi-line parser error into 'message in "code"', taking the
  # message from the 5th line and the offending code from the 7th.
  #
  # Always returns a String. Messages that do not have that layout (a single
  # line, or fewer than seven lines) are returned unchanged instead of
  # failing on the missing lines, so the original error text is never lost.
  def self.parse_maruku_error(error_message)
    message = error_message.to_s
    lines = message.split("\n")
    return message if lines.size < 7

    msg, code = [lines[4], lines[6]].map do |line|
      line.gsub(/\A\|(\s)+|EOF\Z/, '').strip
    end
    "#{msg} in \"#{code}\""
  end

  # Builds the message for InvalidRawHTMLError, quoting the broken fragment
  # (HTML-unescaped) when it can be extracted from the REXML error block in
  # +html+, or a generic message otherwise.
  def self.parse_raw_html_error(html)
    match = html.to_s.match(/REXML could not parse this XML\/HTML\:(.+)<\/pre>/m)
    broken_html = match && match[1].strip
    broken_html.nil? ? "Contains broken raw HTML." : "This raw HTML is invalid: #{CGI.unescapeHTML(broken_html)}"
  end

end