lib/kramdown/parser.rb in kramdown-0.1.0 vs lib/kramdown/parser.rb in kramdown-0.2.0

- old
+ new

@@ -82,14 +82,14 @@ ####### private ####### BLOCK_PARSERS = [:blank_line, :codeblock, :codeblock_fenced, :blockquote, :atx_header, - :setext_header, :horizontal_rule, :list, :link_definition, :block_html, + :setext_header, :horizontal_rule, :list, :definition_list, :link_definition, :block_html, :footnote_definition, :ald, :block_ial, :extension_block, :eob_marker, :paragraph] SPAN_PARSERS = [:emphasis, :codespan, :autolink, :span_html, :footnote_marker, :link, - :span_ial, :html_entity, :typographic_syms, :special_html_chars, :line_break, :escaped_chars,] + :span_ial, :html_entity, :typographic_syms, :line_break, :escaped_chars] # Adapt the object to allow parsing like specified in the options. def configure_parser @parsers = {} BLOCK_PARSERS.each do |name| @@ -290,10 +290,11 @@ EOB_MARKER = /^\^\s*?\n/ # Parse the EOB marker at the current location. def parse_eob_marker @src.pos += @src.matched_size + @tree.children << Element.new(:eob) true end Registry.define_parser(:block, :eob_marker, EOB_MARKER, self) @@ -310,42 +311,44 @@ end true end Registry.define_parser(:block, :paragraph, PARAGRAPH_START, self) + HEADER_ID=/(?:[ \t]\{#((?:\w|\d)[\w\d-]*)\})?/ + SETEXT_HEADER_START = /^(#{OPT_SPACE}[^ \t].*?)#{HEADER_ID}[ \t]*?\n(-|=)+\s*?\n/ - SETEXT_HEADER_START = /^(#{OPT_SPACE}[^ \t].*?)\n(-|=)+\s*?\n/ - # Parse the Setext header at the current location. def parse_setext_header if @tree.children.last && @tree.children.last.type != :blank return false end @src.pos += @src.matched_size - text, level = @src[1].strip, @src[2] + text, id, level = @src[1].strip, @src[2], @src[3] el = Element.new(:header, nil, :level => (level == '-' ? 2 : 1)) add_text(text, el) - el.options[:attr] = {:id => generate_id(text)} if @doc.options[:auto_ids] + el.options[:attr] = {'id' => id} if id + el.options[:attr] = {'id' => generate_id(text)} if @doc.options[:auto_ids] && !id @tree.children << el true end Registry.define_parser(:block, :setext_header, SETEXT_HEADER_START, self) ATX_HEADER_START = /^\#{1,6}/ - ATX_HEADER_MATCH = /^(\#{1,6})(.+?)\s*?#*\s*?\n/ + ATX_HEADER_MATCH = /^(\#{1,6})(.+?)\s*?#*#{HEADER_ID}\s*?\n/ # Parse the Atx header at the current location. def parse_atx_header if @tree.children.last && @tree.children.last.type != :blank return false end result = @src.scan(ATX_HEADER_MATCH) - level, text = @src[1], @src[2].strip + level, text, id = @src[1], @src[2].strip, @src[3] el = Element.new(:header, nil, :level => level.length) add_text(text, el) - el.options[:attr] = {:id => generate_id(text)} if @doc.options[:auto_ids] + el.options[:attr] = {'id' => id} if id + el.options[:attr] = {'id' => generate_id(text)} if @doc.options[:auto_ids] && !id @tree.children << el true end Registry.define_parser(:block, :atx_header, ATX_HEADER_START, self) @@ -396,11 +399,11 @@ end end Registry.define_parser(:block, :codeblock_fenced, FENCED_CODEBLOCK_START, self) - HR_START = /^#{OPT_SPACE}(\*|-|_) *\1 *\1 *(\1| )*\n/ + HR_START = /^#{OPT_SPACE}(\*|-|_)[ \t]*\1[ \t]*\1[ \t]*(\1|[ \t])*\n/ # Parse the horizontal rule at the current location. def parse_horizontal_rule @src.pos += @src.matched_size @tree.children << Element.new(:hr) @@ -429,27 +432,14 @@ nested_list_found = false while !@src.eos? if @src.check(HR_START) break elsif @src.scan(list_start_re) - indentation, content = @src[1].length, @src[2] item = Element.new(:li) + item.value, indentation, content_re, indent_re = parse_first_list_line(@src[1].length, @src[2]) list.children << item - if content =~ /^\s*\n/ - indentation = 4 - else - while content =~ /^ *\t/ - temp = content.scan(/^ */).first.length + indentation - content.sub!(/^( *)(\t+)/) {$1 + " "*(4 - (temp % 4)) + " "*($2.length - 1)*4} - end - indentation += content.scan(/^ */).first.length - end - content.sub!(/^\s*/, '') - item.value = content - indent_re = /^ {#{indentation}}/ - content_re = /^(?:(?:\t| {4}){#{indentation / 4}} {#{indentation % 4}}|(?:\t| {4}){#{indentation / 4 + 1}}).*?\n/ list_start_re = (type == :ul ? /^( {0,#{[3, indentation - 1].min}}[+*-])([\t| ].*?\n)/ : /^( {0,#{[3, indentation - 1].min}}\d+\.)([\t| ].*?\n)/) nested_list_found = false elsif result = @src.scan(content_re) result.sub!(/^(\t+)/) { " "*4*($1 ? $1.length : 0) } @@ -489,11 +479,11 @@ (item == list.children.last && item.children.length == 2 && !eob_found)) text = item.children.shift.children.first text.value += "\n" if !item.children.empty? && item.children[0].type != :blank item.children.unshift(text) else - item.options[:first_as_block] = true + item.options[:first_is_block] = true end if item.children.last.type == :blank last = item.children.pop else @@ -505,11 +495,115 @@ true end Registry.define_parser(:block, :list, LIST_START, self) + def parse_first_list_line(indentation, content) + if content =~ /^\s*\n/ + indentation = 4 + else + while content =~ /^ *\t/ + temp = content.scan(/^ */).first.length + indentation + content.sub!(/^( *)(\t+)/) {$1 + " "*(4 - (temp % 4)) + " "*($2.length - 1)*4} + end + indentation += content.scan(/^ */).first.length + end + content.sub!(/^\s*/, '') + indent_re = /^ {#{indentation}}/ + content_re = /^(?:(?:\t| {4}){#{indentation / 4}} {#{indentation % 4}}|(?:\t| {4}){#{indentation / 4 + 1}}).*?\n/ + [content, indentation, content_re, indent_re] + end + + + DEFINITION_LIST_START = /^(#{OPT_SPACE}:)([\t| ].*?\n)/ + + # Parse the ordered or unordered list at the current location. + def parse_definition_list + children = @tree.children + if !children.last || (children.length == 1 && children.last.type != :p ) || + (children.length >= 2 && children[-1].type != :p && (children[-1].type != :blank || children[-1].value != "\n" || children[-2].type != :p)) + return false + end + + first_as_para = false + deflist = Element.new(:dl) + para = @tree.children.pop + if para.type == :blank + para = @tree.children.pop + first_as_para = true + end + para.children.first.value.split("\n").each do |term| + el = Element.new(:dt) + el.children << Element.new(:text, term) + deflist.children << el + end + + item = nil + indent_re = nil + content_re = nil + def_start_re = DEFINITION_LIST_START + while !@src.eos? + if @src.scan(def_start_re) + item = Element.new(:dd) + item.options[:first_as_para] = first_as_para + item.value, indentation, content_re, indent_re = parse_first_list_line(@src[1].length, @src[2]) + deflist.children << item + + def_start_re = /^( {0,#{[3, indentation - 1].min}}:)([\t| ].*?\n)/ + first_as_para = false + elsif result = @src.scan(content_re) + result.sub!(/^(\t+)/) { " "*4*($1 ? $1.length : 0) } + result.sub!(indent_re, '') + item.value << result + first_as_para = false + elsif result = @src.scan(BLANK_LINE) + first_as_para = true + item.value << result + else + break + end + end + + last = nil + deflist.children.each do |item| + next if item.type == :dt + + parse_blocks(item, item.value) + item.value = nil + next if item.children.size == 0 + + if item.children.last.type == :blank + last = item.children.pop + else + last = nil + end + if item.children.first.type == :p && !item.options.delete(:first_as_para) + text = item.children.shift.children.first + text.value += "\n" if !item.children.empty? + item.children.unshift(text) + else + item.options[:first_is_block] = true + end + end + + if @tree.children.length >= 1 && @tree.children.last.type == :dl + @tree.children[-1].children += deflist.children + elsif @tree.children.length >= 2 && @tree.children[-1].type == :blank && @tree.children[-2].type == :dl + @tree.children.pop + @tree.children[-1].children += deflist.children + else + @tree.children << deflist + end + + @tree.children << last if !last.nil? + + true + end + Registry.define_parser(:block, :definition_list, DEFINITION_LIST_START, self) + + PUNCTUATION_CHARS = "_.:,;!?-" LINK_ID_CHARS = /[a-zA-Z0-9 #{PUNCTUATION_CHARS}]/ LINK_ID_NON_CHARS = /[^a-zA-Z0-9 #{PUNCTUATION_CHARS}]/ LINK_DEFINITION_START = /^#{OPT_SPACE}\[(#{LINK_ID_CHARS}+)\]:[ \t]*(?:<(.*?)>|([^\s]+))[ \t]*?(?:\n?[ \t]*?(["'])(.+?)\4[ \t]*?)?\n/ @@ -610,24 +704,38 @@ #:stopdoc: # The following regexps are based on the ones used by REXML, with some slight modifications. #:startdoc: HTML_COMMENT_RE = /<!--(.*?)-->/m HTML_INSTRUCTION_RE = /<\?(.*?)\?>/m - HTML_ATTRIBUTE_RE = /\s*(#{REXML::Parsers::BaseParser::UNAME_STR})\s*=\s*(["'])(.*?)\2/ - HTML_TAG_RE = /<((?>#{REXML::Parsers::BaseParser::UNAME_STR}))\s*((?>\s+#{REXML::Parsers::BaseParser::UNAME_STR}\s*=\s*(["']).*?\3)*)\s*(\/)?>/ + HTML_ATTRIBUTE_RE = /\s*(#{REXML::Parsers::BaseParser::UNAME_STR})\s*=\s*(["'])(.*?)\2/m + HTML_TAG_RE = /<((?>#{REXML::Parsers::BaseParser::UNAME_STR}))\s*((?>\s+#{REXML::Parsers::BaseParser::UNAME_STR}\s*=\s*(["']).*?\3)*)\s*(\/)?>/m HTML_TAG_CLOSE_RE = /<\/(#{REXML::Parsers::BaseParser::NAME_STR})\s*>/ - HTML_PARSE_AS_BLOCK = %w{div blockquote table dl ol ul form fieldset} - HTML_PARSE_AS_SPAN = %w{a address b dd dt em h1 h2 h3 h4 h5 h6 legend li p pre span td th} - HTML_PARSE_AS_RAW = %w{script math} - HTML_PARSE_AS = Hash.new {|h,k| h[k] = :span} + HTML_PARSE_AS_BLOCK = %w{applet button blockquote colgroup dd div dl fieldset form iframe li + map noscript object ol table tbody td th thead tfoot tr ul} + HTML_PARSE_AS_SPAN = %w{a abbr acronym address b bdo big cite caption code del dfn dt em + h1 h2 h3 h4 h5 h6 i ins kbd label legend optgroup p pre q rb rbc + rp rt rtc ruby samp select small span strong sub sup tt var} + HTML_PARSE_AS_RAW = %w{script math option textarea} + + HTML_PARSE_AS = Hash.new {|h,k| h[k] = :raw} HTML_PARSE_AS_BLOCK.each {|i| HTML_PARSE_AS[i] = :block} HTML_PARSE_AS_SPAN.each {|i| HTML_PARSE_AS[i] = :span} HTML_PARSE_AS_RAW.each {|i| HTML_PARSE_AS[i] = :raw} - HTML_BLOCK_ELEMENTS = %w[div p pre h1 h2 h3 h4 h5 h6 hr form fieldset iframe legend script dl ul ol table ins del blockquote address] + #:stopdoc: + # Some HTML elements like script belong to both categories (i.e. are valid in block and + # span HTML) and don't appear therefore! + #:startdoc: + HTML_SPAN_ELEMENTS = %w{a abbr acronym b big bdo br button cite code del dfn em i img input + ins kbd label option q rb rbc rp rt rtc ruby samp select small span + strong sub sup textarea tt var} + HTML_BLOCK_ELEMENTS = %w{address applet button blockquote caption col colgroup dd div dl dt fieldset + form h1 h2 h3 h4 h5 h6 hr iframe legend li map ol optgroup p pre table tbody + td th thead tfoot tr ul} + HTML_ELEMENTS_WITHOUT_BODY = %w{area br col hr img input} HTML_BLOCK_START = /^#{OPT_SPACE}<(#{REXML::Parsers::BaseParser::UNAME_STR}|\?|!--|\/)/ # Parse the HTML at the current position as block level HTML. def parse_block_html @@ -638,89 +746,129 @@ elsif result = @src.scan(HTML_INSTRUCTION_RE) @tree.children << Element.new(:html_raw, result, :type => :block) @src.scan(/.*?\n/) true else - if !((@src.check(/^#{OPT_SPACE}#{HTML_TAG_RE}/) && (HTML_BLOCK_ELEMENTS.include?(@src[1]) || @src[1] =~ /:/)) || - @src.check(/^#{OPT_SPACE}#{HTML_TAG_CLOSE_RE}/)) - return false + if (!@src.check(/^#{OPT_SPACE}#{HTML_TAG_RE}/) && !@src.check(/^#{OPT_SPACE}#{HTML_TAG_CLOSE_RE}/)) || + HTML_SPAN_ELEMENTS.include?(@src[1]) + if @tree.type == :html_element && @tree.options[:parse_type] != :block + add_html_text(@src.scan(/.*?\n/), @tree) + add_html_text(@src.scan_until(/(?=#{HTML_BLOCK_START})|\Z/), @tree) + return true + else + return false + end end - @src.scan(/^(.*?)\n/) - line = @src[1] - temp = nil + current_el = (@tree.type == :html_element ? @tree : nil) + @src.scan(/^(#{OPT_SPACE})(.*?)\n/) + if current_el && current_el.options[:parse_type] == :raw + add_html_text(@src[1], current_el) + end + line = @src[2] stack = [] while line.size > 0 index_start_tag, index_close_tag = line.index(HTML_TAG_RE), line.index(HTML_TAG_CLOSE_RE) - if index_start_tag && (!index_close_tag || index_start_tag < index_close_tag) && (!temp || temp.options[:parse_type] == :block) + if index_start_tag && (!index_close_tag || index_start_tag < index_close_tag) md = line.match(HTML_TAG_RE) - break if !(HTML_BLOCK_ELEMENTS.include?(md[1]) || md[1] =~ /:/) - - add_text(md.pre_match + "\n", temp) if temp line = md.post_match + add_html_text(md.pre_match, current_el) if current_el + if HTML_SPAN_ELEMENTS.include?(md[1]) || (current_el && current_el.options[:parse_type] == :span) + add_html_text(md.to_s, current_el) if current_el + next + end attrs = {} md[2].scan(HTML_ATTRIBUTE_RE).each {|name,sep,val| attrs[name] = val} - el = Element.new(:html_element, md[1], :attr => attrs, :type => :block, - :parse_type => HTML_PARSE_AS[md[1]]) - (temp || @tree).children << el - if !md[4] + parse_type = if !current_el || current_el.options[:parse_type] != :raw + (@doc.options[:parse_block_html] ? HTML_PARSE_AS[md[1]] : :raw) + else + :raw + end + if val = get_parse_type(attrs.delete('markdown')) + parse_type = (val == :default ? HTML_PARSE_AS[md[1]] : val) + end + el = Element.new(:html_element, md[1], :attr => attrs, :type => :block, :parse_type => parse_type) + el.options[:no_start_indent] = true if !stack.empty? + el.options[:outer_element] = true if !current_el + el.options[:parent_is_raw] = true if current_el && current_el.options[:parse_type] == :raw + + @tree.children << el + if !md[4] && HTML_ELEMENTS_WITHOUT_BODY.include?(el.value) + warning("The HTML tag '#{el.value}' cannot have any content - auto-closing it") + elsif !md[4] @unclosed_html_tags.push(el) - stack << temp - temp = el + @stack.push(@tree) + stack.push(current_el) + @tree = current_el = el end elsif index_close_tag md = line.match(HTML_TAG_CLOSE_RE) - add_text(md.pre_match, temp) if temp - line = md.post_match + add_html_text(md.pre_match, current_el) if current_el + if @unclosed_html_tags.size > 0 && md[1] == @unclosed_html_tags.last.value el = @unclosed_html_tags.pop - @tree = @stack.pop unless temp - temp = stack.pop - if el.options[:parse_type] == :raw - raise Kramdown::Error, "Bug: please report!" if el.children.size > 1 - el.children.first.type = :raw if el.children.first - end + @tree = @stack.pop + current_el.options[:compact] = true if stack.size > 0 + current_el = stack.pop || (@tree.type == :html_element ? @tree : nil) else - if HTML_BLOCK_ELEMENTS.include?(md[1]) && (temp || @tree).options[:parse_type] == :block - warning("Found invalidly nested HTML closing tag for '#{md[1]}'") + if !HTML_SPAN_ELEMENTS.include?(md[1]) && @tree.options[:parse_type] != :span + warning("Found invalidly used HTML closing tag for '#{md[1]}'") + elsif current_el + add_html_text(md.to_s, current_el) end - if temp - add_text(md.to_s, temp) - else - add_text(md.to_s + "\n") - end end else - if temp - add_text(line, temp) + if current_el + line.rstrip! if current_el.options[:parse_type] == :block + add_html_text(line + "\n", current_el) else - warning("Ignoring characters at the end of an HTML block line") + add_text(line + "\n") end line = '' end end - if temp && temp.children.last && temp.children.last.type == :text - temp.children.last.value << "\n" + if current_el && (current_el.options[:parse_type] == :span || current_el.options[:parse_type] == :raw) + result = @src.scan_until(/(?=#{HTML_BLOCK_START})|\Z/) + last = current_el.children.last + result = "\n" + result if last.nil? || (last.type != :text && last.type != :raw) || last.value !~ /\n\Z/ + add_html_text(result, current_el) end - if temp - if temp.options[:parse_type] == :span || temp.options[:parse_type] == :raw - result = @src.scan_until(/(?=#{HTML_BLOCK_START})|\Z/) - add_text(result, temp) - end - @stack.push(@tree) - @tree = temp - end true end end Registry.define_parser(:block, :block_html, HTML_BLOCK_START, self) + # Return the HTML parse type defined by the string +val+, i.e. raw when "0", default parsing + # (return value +nil+) when "1", span parsing when "span" and block parsing when "block". If + # +val+ is nil, then the default parsing mode is used. + def get_parse_type(val) + case val + when "0" then :raw + when "1" then :default + when "span" then :span + when "block" then :block + when NilClass then nil + else + warning("Invalid markdown attribute val '#{val}', using default") + nil + end + end + # Special version of #add_text which either creates a :text element or a :raw element, + # depending on the HTML element type. + def add_html_text(text, tree) + type = (tree.options[:parse_type] == :raw ? :raw : :text) + if tree.children.last && tree.children.last.type == type + tree.children.last.value << text + elsif !text.empty? + tree.children << Element.new(type, text) + end + end ESCAPED_CHARS = /\\([\\.*_+-`()\[\]{}#!])/ # Parse the backslash-escaped character at the current location. @@ -732,51 +880,46 @@ # Parse the HTML entity at the current location. def parse_html_entity @src.pos += @src.matched_size - add_text(@src.matched) + @tree.children << Element.new(:entity, @src.matched) end Registry.define_parser(:span, :html_entity, REXML::Parsers::BaseParser::REFERENCE_RE, self) - SPECIAL_HTML_CHARS = /&|>|</ - - # Parse the special HTML characters at the current location. - def parse_special_html_chars - @src.pos += @src.matched_size - add_text(@src.matched) - end - Registry.define_parser(:span, :special_html_chars, SPECIAL_HTML_CHARS, self) - - LINE_BREAK = /( |\\\\)(?=\n)/ # Parse the line break at the current location. def parse_line_break @src.pos += @src.matched_size @tree.children << Element.new(:br) end Registry.define_parser(:span, :line_break, LINE_BREAK, self) - TYPOGRAPHIC_SYMS = [['---', '&mdash;'], ['--', '&ndash;'], ['...', '&hellip;'], + TYPOGRAPHIC_SYMS = [['---', :mdash], ['--', :ndash], ['...', :ellipsis], ['\\<<', '&lt;&lt;'], ['\\>>', '&gt;&gt;'], - ['<< ', '&laquo;&nbsp;'], [' >>', '&nbsp;&raquo;'], - ['<<', '&laquo;'], ['>>', '&raquo;']] + ['<< ', :laquo_space], [' >>', :raquo_space], + ['<<', :laquo], ['>>', :raquo]] TYPOGRAPHIC_SYMS_SUBST = Hash[*TYPOGRAPHIC_SYMS.flatten] TYPOGRAPHIC_SYMS_RE = /#{TYPOGRAPHIC_SYMS.map {|k,v| Regexp.escape(k)}.join('|')}/ # Parse the typographic symbols at the current location. def parse_typographic_syms @src.pos += @src.matched_size - add_text(TYPOGRAPHIC_SYMS_SUBST[@src.matched].dup) + val = TYPOGRAPHIC_SYMS_SUBST[@src.matched] + if val.kind_of?(Symbol) + @tree.children << Element.new(:typographic_sym, val) + else + add_text(val.dup) + end end Registry.define_parser(:span, :typographic_syms, TYPOGRAPHIC_SYMS_RE, self) - AUTOLINK_START = /<((mailto|https?|ftps?):.*?|.*?@.*?)>/ + AUTOLINK_START = /<((mailto|https?|ftps?):.*?|\S*?@\S*?)>/ # Parse the autolink at the current location. def parse_autolink @src.pos += @src.matched_size @@ -914,30 +1057,55 @@ if result = @src.scan(HTML_COMMENT_RE) @tree.children << Element.new(:html_raw, result, :type => :span) elsif result = @src.scan(HTML_INSTRUCTION_RE) @tree.children << Element.new(:html_raw, result, :type => :span) elsif result = @src.scan(HTML_TAG_RE) + if HTML_BLOCK_ELEMENTS.include?(@src[1]) + add_text(result) + return + end reset_pos = @src.pos attrs = {} - @src[2].scan(HTML_ATTRIBUTE_RE).each {|name,sep,val| attrs[name] = val} + @src[2].scan(HTML_ATTRIBUTE_RE).each {|name,sep,val| attrs[name] = val.gsub(/\n+/, ' ')} + + do_parsing = @doc.options[:parse_span_html] + if val = get_parse_type(attrs.delete('markdown')) + if val == :block + warning("Cannot use block level parsing in span level HTML tag - using default mode") + elsif val == :span || val == :default + do_parsing = true + elsif val == :raw + do_parsing = false + end + end + do_parsing = false if HTML_PARSE_AS_RAW.include?(@src[1]) + el = Element.new(:html_element, @src[1], :attr => attrs, :type => :span) + stop_re = /<\/#{Regexp.escape(@src[1])}\s*>/ if @src[4] @tree.children << el + elsif HTML_ELEMENTS_WITHOUT_BODY.include?(el.value) + warning("The HTML tag '#{el.value}' cannot have any content - auto-closing it") + @tree.children << el else - stop_re = /<\/#{Regexp.escape(@src[1])}\s*>/ if parse_spans(el, stop_re) + end_pos = @src.pos @src.scan(stop_re) @tree.children << el + if !do_parsing + el.children.clear + el.children << Element.new(:raw, @src.string[reset_pos...end_pos]) + end else @src.pos = reset_pos add_text(result) end end else add_text(@src.scan(/./)) end end - Registry.define_parser(:span, :span_html, HTML_BLOCK_START, self) + Registry.define_parser(:span, :span_html, HTML_SPAN_START, self) LINK_TEXT_BRACKET_RE = /\\\[|\\\]|\[|\]/ LINK_INLINE_ID_RE = /\s*?\[(#{LINK_ID_CHARS}+)?\]/ LINK_INLINE_TITLE_RE = /\s*?(["'])(.+?)\1\s*?\)/