require 'maruku'
require 'redcarpet'
require 'nokogiri'
require 'uri'
require 'cgi'
require 'sanitize'
require 'ostruct'
require_relative './devcenter-parser/header_id_generator'
require_relative './devcenter-parser/github_parser'
require_relative './devcenter-parser/maruku_parser'

# Converts Markdown (in the :github or :maruku flavour) into HTML, optionally
# sanitized with the Sanitize gem.
#
# Public entry points:
#   DevcenterParser.to_html(markdown, flavour)             # sanitized HTML
#   DevcenterParser.to_unsanitized_html(markdown, flavour) # raw parser output
#   DevcenterParser.sanitize(html)
module DevcenterParser
  # Markdown flavours accepted by to_html / to_unsanitized_html.
  AVAILABLE_FLAVOURS = [:github, :maruku].freeze

  # NOTE: these inherit from Exception (not StandardError), so a bare `rescue`
  # does NOT catch them; callers must rescue them by name. The bare `rescue` in
  # to_unsanitized_html relies on this to let UnknownFlavourError propagate
  # untouched. The hierarchy is kept as-is for backward compatibility.
  class InvalidMarkdownError < Exception; end
  class InvalidRawHTMLError < Exception; end
  class UnknownFlavourError < Exception; end

  # Renders +markdown+ with the given +flavour+ and sanitizes the result.
  #
  # markdown - the Markdown source (anything responding to to_s).
  # flavour  - :github or :maruku (String or Symbol).
  #
  # Returns the sanitized HTML.
  # Raises UnknownFlavourError for an unsupported flavour (including nil) and
  # InvalidMarkdownError when parsing fails.
  def self.to_html(markdown, flavour)
    sanitize(to_unsanitized_html(markdown, flavour))
  end

  # Renders +markdown+ with the given +flavour+ without sanitizing the output.
  #
  # Returns the HTML produced by the selected parser after header ids and
  # relative article links have been processed.
  # Raises UnknownFlavourError for an unsupported flavour and
  # InvalidMarkdownError when the Markdown (or raw HTML inside it) cannot be
  # parsed.
  def self.to_unsanitized_html(markdown, flavour)
    flavour_name = flavour.to_s
    unless AVAILABLE_FLAVOURS.map(&:to_s).include?(flavour_name)
      raise UnknownFlavourError, "Markdown flavour '#{flavour}' not supported"
    end

    parser = flavour_name == 'maruku' ? MarukuParser : GitHubParser
    doc_to_html(parser.parse(normalize_markdown(markdown)))
  rescue InvalidRawHTMLError => e
    raise InvalidMarkdownError, e.message
  rescue => e
    # Any other StandardError raised while parsing is reported as invalid Markdown.
    raise InvalidMarkdownError, parse_maruku_error(e.message)
  end

  # Cleans +html+ using the configuration built by sanitize_config.
  def self.sanitize(html)
    Sanitize.clean(html, sanitize_config)
  end

  # NOTE: a bare `private` does not affect methods defined with `def self.`,
  # so the helpers below remain callable from outside the module. That is left
  # unchanged so existing callers keep working.
  private

  # Post-processes the parsed document and serializes it to UTF-8 HTML.
  # Raises InvalidRawHTMLError if the output carries the raw-HTML error marker.
  def self.doc_to_html(doc)
    HeaderIdGenerator.apply!(doc)
    convert_to_article_links_all_relative_links_with_missing_initial_slashes(doc)
    html = doc.to_html(:encoding => 'utf-8')
    verify_raw_html(html)
    html
  end

  # Prepares the Markdown source for the parsers: strips trailing whitespace,
  # neutralizes markup inside HTML comments and splits consecutive blockquotes.
  def self.normalize_markdown(markdown)
    markdown = normalize_newlines(markdown.to_s)
    markdown = remove_tags_inside_html_comments(markdown)
    separate_consecutive_blockquote_blocks(markdown)
  end

  # The current parsers consider something like:
  #
  #   > foo
  #
  #   > bar
  #
  # as a single blockquote, while we want it to be two different ones.
  # This method adds an empty (hidden) paragraph between consecutive blocks so
  # that the parsers process them separately.
  def self.separate_consecutive_blockquote_blocks(markdown)
    separator = '<p class="devcenter-parser-special-block-separator" style="display:none">&nbsp;</p>'
    # A quoted line, followed by exactly two whitespace characters (normally a
    # blank line), followed by the start of another quoted line.
    markdown.gsub(/^>(.*)?\s\s>/, '>\1' + "\n\n#{separator}\n\n>")
  end

  # Strips trailing whitespace from every line and joins the lines with "\n".
  def self.normalize_newlines(markdown)
    markdown.lines.map(&:rstrip).join("\n")
  end

  # Removes square brackets, parentheses and angle brackets from the body of
  # every HTML comment, leaving the comment delimiters themselves in place.
  # Each comment is handled independently (non-greedy match) and may span
  # several lines.
  def self.remove_tags_inside_html_comments(markdown)
    markdown.gsub(/<!--(.*?)-->/m) do
      "<!--#{Regexp.last_match(1).delete('[]()<>')}-->"
    end
  end

  # Sanitize configuration used by .sanitize, built once and memoized.
  # Note that Sanitize requires all the elements/attributes downcased.
  def self.sanitize_config
    @sanitize_config ||= build_sanitize_config
  end

  # Builds the configuration on top of Sanitize::Config::RELAXED without
  # mutating that shared constant.
  def self.build_sanitize_config
    relaxed = Sanitize::Config::RELAXED

    attributes = relaxed[:attributes].dup
    attributes[:all] = Array(attributes[:all]) +
                       %w{ id class style name width height border align } +
                       # embedded videos
                       %w{ value src type allowscriptaccess allowfullscreen webkitallowfullscreen mozallowfullscreen frameborder }
    attributes['a'] = Array(attributes['a']) + %w{ target }

    elements = Array(relaxed[:elements]) +
               %w{ div span hr tt } +
               # embedded videos
               %w{ object param embed iframe }

    relaxed.merge(
      :attributes => attributes,
      :elements => elements,
      # Force embedded content to never get script access.
      :add_attributes => {
        'object' => { 'allowscriptaccess' => 'never' },
        'embed' => { 'allowscriptaccess' => 'never' },
        'param' => { 'allowscriptaccess' => 'never' }
      },
      :remove_contents => true,
      :allow_comments => true
    )
  end
  private_class_method :build_sanitize_config

  # Rewrites every relative link that lacks a leading slash so that it points
  # under /articles/. Links without an href, or whose href starts with "http",
  # "/", "mailto:" or "#", are left alone.
  def self.convert_to_article_links_all_relative_links_with_missing_initial_slashes(doc)
    doc.css('a').each do |node|
      href = node['href']
      next if href.nil? || href =~ /\Ahttp|\A\/|\Amailto\:|\A#/

      # Collapse the prefix when the href already started with "articles/".
      node['href'] = "/articles/#{href}".gsub('/articles/articles/', '/articles/')
    end
  end

  # Raises InvalidRawHTMLError when +html+ contains the raw-HTML error marker.
  def self.verify_raw_html(html)
    raise(InvalidRawHTMLError, parse_raw_html_error(html)) if invalid_raw_html?(html)
  end

  # True when +html+ contains the 'markdown-html-error' marker
  # (presumably emitted by the parser for raw HTML it could not parse).
  def self.invalid_raw_html?(html)
    html.to_s.include?('markdown-html-error')
  end

  # Condenses a multi-line parser error into `<message> in "<code>"`, taking
  # the message from the 5th line and the offending code from the 7th line.
  # When the error does not have that shape (single-line or short messages),
  # the original message is returned unchanged as a String.
  def self.parse_maruku_error(error_message)
    message = error_message.to_s
    lines = message.split("\n")
    msg_line = lines[4]
    code_line = lines[6]
    return message if msg_line.nil? || code_line.nil?

    msg = msg_line.gsub(/\A\|(\s)+|EOF\Z/, '').strip
    code = code_line.gsub(/\A\|(\s)+|EOF\Z/, '').strip
    "#{msg} in \"#{code}\""
  end

  # Builds the error message for broken raw HTML, quoting the offending
  # fragment (HTML-unescaped) when it can be extracted from +html+.
  def self.parse_raw_html_error(html)
    match = html.to_s.match(/REXML could not parse this XML\/HTML\:(.+)<\/pre>/m)
    return 'Contains broken raw HTML.' if match.nil?

    "This raw HTML is invalid: #{CGI.unescapeHTML(match[1].strip)}"
  end
end