lexer.rb in asciidoctor-0.1.1

- old
+ new

@@ -1,5 +1,6 @@
+module Asciidoctor
 # Public: Methods to parse lines of AsciiDoc into an object hierarchy
 # representing the structure of the document. All methods are class methods and
 # should be invoked from the Lexer class. The main entry point is ::next_block.
 # No Lexer instances shall be discovered running around. (Any attempt to
 # instantiate a Lexer will be futile).
@@ -18,13 +19,13 @@
 #   doc = Document.new
 #   reader = Reader.new lines
 #   block = Lexer.next_block(reader, doc)
 #   block.class
 #   # => Asciidoctor::Block
-class Asciidoctor::Lexer
+class Lexer
 
-  include Asciidoctor
+  BlockMatchData = Struct.new(:name, :tip, :terminator)
 
   # Public: Make sure the Lexer object doesn't get initialized.
   #
   # Raises RuntimeError if this constructor is invoked.
   def initialize
@@ -38,32 +39,64 @@
   # proceeds to iterate through the lines in the Reader, parsing the document
   # into nested Sections and Blocks.
   #
   # reader   - the Reader holding the source lines of the document
   # document - the empty Document into which the lines will be parsed
+  # options  - a Hash of options to control processing
   #
   # returns the Document object
-  def self.parse(reader, document)
-    # process and plow away any attribute lines that proceed the first block so
-    # we can get at the document title, if present, then begin parsing blocks
-    reader.skip_blank_lines
-    attributes = parse_block_metadata_lines(reader, document)
+  def self.parse(reader, document, options = {})
+    block_attributes = parse_document_header(reader, document)
 
-    # by processing the header here, we enforce its position at head of the document  
-    next_level = is_next_line_section? reader, attributes
-    if next_level == 0
-      title_info = parse_section_title(reader) 
-      document.title = title_info[1]
-      parse_header_metadata(reader, document)
+    unless options[:header_only]
+      while reader.has_more_lines?
+        new_section, block_attributes = next_section(reader, document, block_attributes)
+        document << new_section unless new_section.nil?
+      end
     end
 
-    while reader.has_lines?
-      new_section, attributes = next_section(reader, document, attributes)
-      document << new_section unless new_section.nil?
+    document
+  end
+
+  # Public: Parses the document header of the AsciiDoc source read from the Reader
+  #
+  # Reads the AsciiDoc source from the Reader until the end of the document
+  # header is reached. The Document object is populated with information from
+  # the header (document title, document attributes, etc). The document
+  # attributes are then saved to establish a save point to which to roll back
+  # after parsing is complete.
+  #
+  # This method assumes that there are no blank lines at the start of the document,
+  # which are automatically removed by the reader.
+  #
+  # returns the Hash of orphan block attributes captured above the header
+  def self.parse_document_header(reader, document)
+    # capture any lines of block-level metadata and plow away any comment lines
+    # that precede first block
+    block_attributes = parse_block_metadata_lines(reader, document)
+
+    # check if the first line is the document title
+    # if so, add a header to the document and parse the header metadata
+    if is_next_line_document_title?(reader, block_attributes)
+      document.id, document.title, _, _ = parse_section_title(reader)
+      # QUESTION: should this be encapsulated in document?
+      if document.id.nil? && block_attributes.has_key?('id')
+        document.id = block_attributes.delete('id')
+      end
+      parse_header_metadata(reader, document)
     end
 
-    document
+    if document.attributes.has_key? 'doctitle'
+      document.title = document.attributes['doctitle']
+    end
+ 
+    document.clear_playback_attributes block_attributes
+    document.save_attributes
+ 
+    # NOTE these are the block-level attributes (not document attributes) that
+    # precede the first line of content (document title, first section or first block)
+    block_attributes
   end
 
   # Public: Return the next section from the Reader.
   #
   # This method processes block metadata, content and subsections for this
@@ -143,22 +176,22 @@
     # 2. then look for a section, recurse if found
     # 3. then process blocks
     #
     # We have to parse all the metadata lines before continuing with the loop,
     # otherwise subsequent metadata lines get interpreted as block content
-    while reader.has_lines?
+    while reader.has_more_lines?
       parse_block_metadata_lines(reader, section, attributes)
 
       next_level = is_next_line_section? reader, attributes
       if next_level
         doctype = parent.document.doctype
         if next_level == 0 && doctype != 'book'
-          puts "asciidoctor: ERROR: only book doctypes can contain level 0 sections"
+          puts "asciidoctor: ERROR: line #{reader.lineno + 1}: only book doctypes can contain level 0 sections"
         end
         if next_level > current_level || (section.is_a?(Document) && next_level == 0)
           unless expected_next_levels.nil? || expected_next_levels.include?(next_level)
-            puts "asciidoctor: WARNING: section title out of sequence: " +
+            puts "asciidoctor: WARNING: line #{reader.lineno + 1}: section title out of sequence: " +
                 "expected #{expected_next_levels.size > 1 ? 'levels' : 'level'} #{expected_next_levels * ' or '}, " +
                 "got level #{next_level}"
           end
           # the attributes returned are those that are orphaned
           new_section, attributes = next_section(reader, section, attributes)
@@ -208,27 +241,24 @@
   # parent - The Document, Section or Block to which the next block belongs
   # 
   # Returns a Section or Block object holding the parsed content of the processed lines
   def self.next_block(reader, parent, attributes = {}, options = {})
     # Skip ahead to the block content
-    skipped = reader.skip_blank
+    skipped = reader.skip_blank_lines
 
     # bail if we've reached the end of the section content
-    return nil unless reader.has_lines?
+    return nil unless reader.has_more_lines?
 
     if options[:text] && skipped > 0
       options.delete(:text)
     end
 
-    Asciidoctor.debug {
+    Debug.debug {
       msg = []
       msg << '/' * 64
       msg << 'next_block() - First two lines are:'
-      msg << reader.peek_line
-      tmp_line = reader.get_line
-      msg << reader.peek_line
-      reader.unshift tmp_line
+      msg.concat reader.peek_lines(2)
       msg << '/' * 64
       msg * "\n"
     }
     
     parse_metadata = options[:parse_metadata] || true
@@ -236,121 +266,115 @@
 
     document = parent.document
     context = parent.is_a?(Block) ? parent.context : nil
     block = nil
 
-    while reader.has_lines? && block.nil?
+    while reader.has_more_lines? && block.nil?
       if parse_metadata && parse_block_metadata_line(reader, document, attributes, options)
-        reader.next_line
+        reader.advance
         next
       elsif parse_sections && context.nil? && is_next_line_section?(reader, attributes)
         block, attributes = next_section(reader, parent, attributes)
         break
       end
 
       this_line = reader.get_line
 
-      delimited_blk = delimited_block? this_line
+      block_context = nil
+      terminator = nil
+      if delimited_blk_match = is_delimited_block?(this_line, true)
+        block_context = delimited_blk_match.name
+        terminator = delimited_blk_match.terminator
+      end
 
-      # NOTE I've haven't decided whether I want this check here or in
-      # parse_block_metadata (where it is currently)
-      #if this_line.match(REGEXP[:comment_blk])
-      #  reader.grab_lines_until {|line| line.match( REGEXP[:comment_blk] ) }
-      #  reader.skip_blank
-      #  # NOTE we should break here because we have found a block, it
-      #  # just happens to be nil...if we keep going we potentially overrun
-      #  # a section heading which is not processed in this anymore
-      #  break
+      # NOTE we're letting break lines (ruler, page_break, etc) have attributes
+      if !options[:text] && block_context.nil? && (match = this_line.match(REGEXP[:break_line]))
+        block = Block.new(parent, BREAK_LINES[match[0][0..2]])
+        reader.skip_blank_lines
 
-      # NOTE we're letting ruler have attributes
-      if !options[:text] && this_line.match(REGEXP[:ruler])
-        block = Block.new(parent, :ruler)
-        reader.skip_blank
-
-      elsif !options[:text] && (match = this_line.match(REGEXP[:image_blk]))
+      elsif !options[:text] && block_context.nil? && (match = this_line.match(REGEXP[:image_blk]))
         block = Block.new(parent, :image)
         AttributeList.new(document.sub_attributes(match[2])).parse_into(attributes, ['alt', 'width', 'height'])
         target = block.sub_attributes(match[1])
         if !target.to_s.empty?
           attributes['target'] = target
           document.register(:images, target)
           attributes['alt'] ||= File.basename(target, File.extname(target))
-          # hmmm, this assignment seems like a one-off
           block.title = attributes['title']
-          if block.title? && attributes['caption'].nil?
-            attributes['caption'] = "Figure #{document.counter('figure-number')}. "
+          if block.title? && !attributes.has_key?('caption') && !block.attr?('caption')
+            number = document.counter('figure-number')
+            attributes['caption'] = "#{document.attributes['figure-caption']} #{number}. "
+            Document::AttributeEntry.new('figure-number', number).save_to(attributes)
           end
         else
           # drop the line if target resolves to nothing
           block = nil
         end
-        reader.skip_blank
+        reader.skip_blank_lines
 
-      elsif delimited_blk && (match = this_line.match(REGEXP[:open_blk]))
+      elsif block_context == :open
         # an open block is surrounded by '--' lines and has zero or more blocks inside
-        terminator = match[0]
         buffer = Reader.new reader.grab_lines_until(:terminator => terminator)
 
         # Strip lines off end of block - not implemented yet
-        # while buffer.has_lines? && buffer.last.strip.empty?
+        # while buffer.has_more_lines? && buffer.last.strip.empty?
         #   buffer.pop
         # end
 
-        block = Block.new(parent, :open)
-        while buffer.has_lines?
+        block = Block.new(parent, block_context)
+        while buffer.has_more_lines?
           new_block = next_block(buffer, block)
           block.blocks << new_block unless new_block.nil?
         end
 
       # needs to come before list detection
-      elsif delimited_blk && (match = this_line.match(REGEXP[:sidebar_blk]))
+      elsif block_context == :sidebar
         # sidebar is surrounded by '****' (4 or more '*' chars) lines
-        terminator = match[0]
         # FIXME violates DRY because it's a duplication of quote parsing
-        block = Block.new(parent, :sidebar)
+        block = Block.new(parent, block_context)
         buffer = Reader.new reader.grab_lines_until(:terminator => terminator)
 
-        while buffer.has_lines?
+        while buffer.has_more_lines?
           new_block = next_block(buffer, block)
           block.blocks << new_block unless new_block.nil?
         end
 
-      elsif match = this_line.match(REGEXP[:colist])
+      elsif block_context.nil? && (match = this_line.match(REGEXP[:colist]))
         block = Block.new(parent, :colist)
         attributes['style'] = 'arabic'
         items = []
         block.buffer = items
-        reader.unshift this_line
+        reader.unshift_line this_line
         expected_index = 1
         begin
           # might want to move this check to a validate method
           if match[1].to_i != expected_index
-            puts "asciidoctor: WARNING: callout list item index: expected #{expected_index} got #{match[1]}"
+            puts "asciidoctor: WARNING: line #{reader.lineno + 1}: callout list item index: expected #{expected_index} got #{match[1]}"
           end
           list_item = next_list_item(reader, block, match)
           expected_index += 1
           if !list_item.nil?
             items << list_item
             coids = document.callouts.callout_ids(items.size)
             if !coids.empty?
               list_item.attributes['coids'] = coids
             else
-              puts 'asciidoctor: WARNING: no callouts refer to list item ' + items.size.to_s
+              puts "asciidoctor: WARNING: line #{reader.lineno}: no callouts refer to list item #{items.size}"
             end
           end
-        end while reader.has_lines? && match = reader.peek_line.match(REGEXP[:colist])
+        end while reader.has_more_lines? && match = reader.peek_line.match(REGEXP[:colist])
 
         document.callouts.next_list
 
-      elsif match = this_line.match(REGEXP[:ulist])
+      elsif block_context.nil? && (match = this_line.match(REGEXP[:ulist]))
         AttributeList.rekey(attributes, ['style'])
-        reader.unshift(this_line)
+        reader.unshift_line this_line
         block = next_outline_list(reader, :ulist, parent)
 
-      elsif match = this_line.match(REGEXP[:olist])
+      elsif block_context.nil? && (match = this_line.match(REGEXP[:olist]))
         AttributeList.rekey(attributes, ['style'])
-        reader.unshift(this_line)
+        reader.unshift_line this_line
         block = next_outline_list(reader, :olist, parent)
         # QUESTION move this logic to next_outline_list?
         if !(attributes.has_key? 'style') && !(block.attributes.has_key? 'style')
           marker = block.buffer.first.marker
           if marker.start_with? '.'
@@ -361,103 +385,114 @@
             style = ORDERED_LIST_STYLES.detect{|s| marker.match(ORDERED_LIST_MARKER_PATTERNS[s]) }
             attributes['style'] = (style || ORDERED_LIST_STYLES.first).to_s
           end
         end
 
-      elsif match = this_line.match(REGEXP[:dlist])
-        reader.unshift this_line
+      elsif block_context.nil? && (match = this_line.match(REGEXP[:dlist]))
+        reader.unshift_line this_line
         block = next_labeled_list(reader, match, parent)
         AttributeList.rekey(attributes, ['style'])
 
-      elsif delimited_blk && (match = this_line.match(document.nested? ? REGEXP[:table_nested] : REGEXP[:table]))
+      elsif block_context == :table
         # table is surrounded by lines starting with a | followed by 3 or more '=' chars
-        terminator = match[0]
         AttributeList.rekey(attributes, ['style'])
         table_reader = Reader.new reader.grab_lines_until(:terminator => terminator, :skip_line_comments => true)
         block = next_table(table_reader, parent, attributes)
-        # hmmm, this assignment seems like a one-off
         block.title = attributes['title']
-        if block.title? && attributes['caption'].nil?
-          attributes['caption'] = "Table #{document.counter('table-number')}. "
+        if block.title? && !attributes.has_key?('caption') && !block.attr?('caption')
+          number = document.counter('table-number')
+          attributes['caption'] = "#{document.attributes['table-caption']} #{number}. "
+          Document::AttributeEntry.new('table-number', number).save_to(attributes)
         end
     
       # FIXME violates DRY because it's a duplication of other block parsing
-      elsif delimited_blk && (match = this_line.match(REGEXP[:example]))
+      elsif block_context == :example
         # example is surrounded by lines with 4 or more '=' chars
-        terminator = match[0]
         AttributeList.rekey(attributes, ['style'])
         if admonition_style = ADMONITION_STYLES.detect {|s| attributes['style'] == s}
           block = Block.new(parent, :admonition)
-          attributes['name'] = admonition_style.downcase
-          attributes['caption'] ||= admonition_style.capitalize
+          attributes['name'] = admonition_name = admonition_style.downcase
+          attributes['caption'] ||= document.attributes["#{admonition_name}-caption"]
         else
-          block = Block.new(parent, :example)
-          # hmmm, this assignment seems like a one-off
+          block = Block.new(parent, block_context)
           block.title = attributes['title']
-          if block.title? && attributes['caption'].nil?
-            attributes['caption'] = "Example #{document.counter('example-number')}. "
+          if block.title? && !attributes.has_key?('caption') && !block.attr?('caption')
+            number = document.counter('example-number')
+            attributes['caption'] = "#{document.attributes['example-caption']} #{number}. "
+            Document::AttributeEntry.new('example-number', number).save_to(attributes)
           end
         end
         buffer = Reader.new reader.grab_lines_until(:terminator => terminator)
 
-        while buffer.has_lines?
+        while buffer.has_more_lines?
           new_block = next_block(buffer, block)
           block.blocks << new_block unless new_block.nil?
         end
 
       # FIXME violates DRY w/ non-delimited block listing
-      elsif delimited_blk && (match = this_line.match(REGEXP[:listing]))
-        terminator = match[0]
-        AttributeList.rekey(attributes, ['style', 'language', 'linenums'])
+      elsif block_context == :listing || block_context == :fenced_code
+        if block_context == :fenced_code
+          attributes['style'] = 'source'
+          lang = this_line[3..-1].strip
+          attributes['language'] = lang unless lang.empty?
+          terminator = terminator[0..2] if terminator.length > 3
+        else
+          AttributeList.rekey(attributes, ['style', 'language', 'linenums'])
+        end
         buffer = reader.grab_lines_until(:terminator => terminator)
         buffer.last.chomp! unless buffer.empty?
         block = Block.new(parent, :listing, buffer)
+        block.title = attributes['title']
+        if document.attributes.has_key?('listing-caption') &&
+            block.title? && !attributes.has_key?('caption') && !block.attr?('caption')
+          number = document.counter('listing-number')
+          attributes['caption'] = "#{document.attributes['listing-caption']} #{number}. "
+          Document::AttributeEntry.new('listing-number', number).save_to(attributes)
+        end
 
-      elsif delimited_blk && (match = this_line.match(REGEXP[:quote]))
+      elsif block_context == :quote
         # multi-line verse or quote is surrounded by a block delimiter
-        terminator = match[0]
         AttributeList.rekey(attributes, ['style', 'attribution', 'citetitle'])
         quote_context = (attributes['style'] == 'verse' ? :verse : :quote)
         block_reader = Reader.new reader.grab_lines_until(:terminator => terminator)
 
-        # only quote can have other section elements (as as section block)
+        # only quote can have other section elements (as section block)
         section_body = (quote_context == :quote)
 
         if section_body
           block = Block.new(parent, quote_context)
-          while block_reader.has_lines?
+          while block_reader.has_more_lines?
             new_block = next_block(block_reader, block)
             block.blocks << new_block unless new_block.nil?
           end
         else
           block_reader.chomp_last!
           block = Block.new(parent, quote_context, block_reader.lines)
         end
 
-      elsif delimited_blk && (blk_ctx = [:literal, :pass].detect{|t| this_line.match(REGEXP[t])})
+      elsif block_context == :literal || block_context == :pass
         # literal is surrounded by '....' (4 or more '.' chars) lines
         # pass is surrounded by '++++' (4 or more '+' chars) lines
-        terminator = $~[0]
         buffer = reader.grab_lines_until(:terminator => terminator)
         buffer.last.chomp! unless buffer.empty?
         # a literal can masquerade as a listing
         if attributes[1] == 'listing'
-          blk_ctx = :listing
+          block_context = :listing
         end
-        block = Block.new(parent, blk_ctx, buffer)
+        block = Block.new(parent, block_context, buffer)
 
       elsif this_line.match(REGEXP[:lit_par])
         # literal paragraph is contiguous lines starting with
         # one or more space or tab characters
 
         # So we need to actually include this one in the grab_lines group
-        reader.unshift this_line
+        reader.unshift_line this_line
         buffer = reader.grab_lines_until(:preserve_last_line => true, :break_on_blank_lines => true) {|line|
           # labeled list terms can be indented, but a preceding blank indicates
           # we are in a list continuation and therefore literals should be strictly literal
           (context == :dlist && skipped == 0 && line.match(REGEXP[:dlist])) ||
-          delimited_block?(line)
+          is_delimited_block?(line)
         }
 
         # trim off the indentation equivalent to the size of the least indented line
         if !buffer.empty?
           offset = buffer.map {|line| line.match(REGEXP[:leading_blanks])[1].length }.min
@@ -475,39 +510,47 @@
           attributes['options'] << 'listparagraph'
         end
 
       ## these switches based on style need to come immediately before the else ##
 
-      elsif attributes[1] == 'source'
-        AttributeList.rekey(attributes, ['style', 'language', 'linenums'])
-        reader.unshift(this_line)
+      elsif attributes[1] == 'source' || attributes[1] == 'listing'
+        if attributes[1] == 'source'
+          AttributeList.rekey(attributes, ['style', 'language', 'linenums'])
+        end
+        reader.unshift_line this_line
         buffer = reader.grab_lines_until(:break_on_blank_lines => true)
         buffer.last.chomp! unless buffer.empty?
         block = Block.new(parent, :listing, buffer)
 
+      elsif attributes[1] == 'literal'
+        reader.unshift_line this_line
+        buffer = reader.grab_lines_until(:break_on_blank_lines => true)
+        buffer.last.chomp! unless buffer.empty?
+        block = Block.new(parent, :literal, buffer)
+
       elsif admonition_style = ADMONITION_STYLES.detect{|s| attributes[1] == s}
         # an admonition is preceded by [<TYPE>] and lasts until a blank line
-        reader.unshift(this_line)
+        reader.unshift_line this_line
         buffer = reader.grab_lines_until(:break_on_blank_lines => true)
         buffer.last.chomp! unless buffer.empty?
         block = Block.new(parent, :admonition, buffer)
         attributes['style'] = admonition_style
-        attributes['name'] = admonition_style.downcase
-        attributes['caption'] ||= admonition_style.capitalize
+        attributes['name'] = admonition_name = admonition_style.downcase
+        attributes['caption'] ||= document.attributes["#{admonition_name}-caption"]
 
       elsif quote_context = [:quote, :verse].detect{|s| attributes[1] == s.to_s}
         # single-paragraph verse or quote is preceded by [verse] or [quote], respectively, and lasts until a blank line
         AttributeList.rekey(attributes, ['style', 'attribution', 'citetitle'])
-        reader.unshift(this_line)
+        reader.unshift_line this_line
         buffer = reader.grab_lines_until(:break_on_blank_lines => true)
         buffer.last.chomp! unless buffer.empty?
         block = Block.new(parent, quote_context, buffer)
 
       # a floating (i.e., discrete) title
       elsif ['float', 'discrete'].include?(attributes[1]) && is_section_title?(this_line, reader.peek_line)
         attributes['style'] = attributes[1]
-        reader.unshift this_line
+        reader.unshift_line this_line
         float_id, float_title, float_level, _ = parse_section_title reader
         block = Block.new(parent, :floating_title)
         if float_id.nil? || float_id.empty?
           # FIXME remove hack of creating throwaway Section to get at the generate_id method
           tmp_sect = Section.new(parent)
@@ -520,13 +563,13 @@
         block.level = float_level
         block.title = float_title
 
       # a paragraph - contiguous nonblank/noncontinuation lines
       else
-        reader.unshift this_line
+        reader.unshift_line this_line
         buffer = reader.grab_lines_until(:break_on_blank_lines => true, :preserve_last_line => true, :skip_line_comments => true) {|line|
-          delimited_block?(line) || line.match(REGEXP[:attr_line]) ||
+          is_delimited_block?(line) || line.match(REGEXP[:attr_line]) ||
           # next list item can be directly adjacent to paragraph of previous list item
           context == :dlist && line.match(REGEXP[:dlist])
           # not sure if there are any cases when we need this check for other list types
           #LIST_CONTEXTS.include?(context) && line.match(REGEXP[context])
         }
@@ -542,12 +585,12 @@
 
         if !options[:text] && (admonition = buffer.first.match(Regexp.new('^(' + ADMONITION_STYLES.join('|') + '):\s+')))
           buffer[0] = admonition.post_match
           block = Block.new(parent, :admonition, buffer)
           attributes['style'] = admonition[1]
-          attributes['name'] = admonition[1].downcase
-          attributes['caption'] ||= admonition[1].capitalize
+          attributes['name'] = admonition_name = admonition[1].downcase
+          attributes['caption'] ||= document.attributes["#{admonition_name}-caption"]
         else
           buffer.last.chomp!
           block = Block.new(parent, :paragraph, buffer)
         end
       end
@@ -576,21 +619,41 @@
   end
 
   # Public: Determines whether this line is the start of any of the delimited blocks
   #
   # returns true (or a BlockMatchData when return_match_data is true) if this line is the first line of a delimited block, or nil if not
-  #--
-  # TODO could use the match value as a lookup for the block type so we don't have
-  # to do any subsequent regexp
-  def self.delimited_block?(line)
-    # naive match
-    #line.match(REGEXP[:any_blk])
+  def self.is_delimited_block?(line, return_match_data = false)
+    line_len = line.length
+    # optimized for best performance
+    if line_len > 2
+      if line_len == 3
+        tip = line.chop
+        tl = 2
+      else
+        tip = line[0..3]
+        tl = 4
 
-    # attempt at better performance
-    if line.length > 0
-      # NOTE accessing the first element before calling ord is first Ruby 1.8.7 compat
-      REGEXP[:any_blk_ord].include?(line[0..0][0].ord) ? line.match(REGEXP[:any_blk]) : nil
+        # special case for fenced code blocks
+        tip_alt = tip.chop
+        if tip_alt == '```' || tip_alt == '~~~'
+          tip = tip_alt
+          tl = 3
+        end
+      end
+
+      if DELIMITED_BLOCKS.has_key? tip
+        # if tip is the full line
+        if tl == line_len - 1
+          return_match_data ? BlockMatchData.new(DELIMITED_BLOCKS[tip], tip, tip) : true
+        elsif match = line.match(REGEXP[:any_blk])
+          return_match_data ? BlockMatchData.new(DELIMITED_BLOCKS[tip], tip, match[0]) : true
+        else
+          nil
+        end
+      else
+        nil
+      end
     else
       nil
     end
   end
 
@@ -608,13 +671,13 @@
     if parent.context == list_type
       list_block.level = parent.level + 1
     else
       list_block.level = 1
     end
-    Asciidoctor.debug { "Created #{list_type} block: #{list_block}" }
+    Debug.debug { "Created #{list_type} block: #{list_block}" }
 
-    while reader.has_lines? && (match = reader.peek_line.match(REGEXP[list_type]))
+    while reader.has_more_lines? && (match = reader.peek_line.match(REGEXP[list_type]))
 
       marker = resolve_list_marker(list_type, match[1])
 
       # if we are moving to the next item, and the marker is different
       # determine if we are moving up or down in nesting
@@ -646,11 +709,11 @@
       end
 
       items << list_item unless list_item.nil?
       list_item = nil
 
-      reader.skip_blank
+      reader.skip_blank_lines
     end
 
     list_block
   end
 
@@ -705,11 +768,11 @@
     # that uses the same delimiter (::, :::, :::: or ;;)
     sibling_pattern = REGEXP[:dlist_siblings][match[2]]
 
     begin
       pairs << next_list_item(reader, block, match, sibling_pattern)
-    end while reader.has_lines? && match = reader.peek_line.match(sibling_pattern)
+    end while reader.has_more_lines? && match = reader.peek_line.match(sibling_pattern)
 
     block
   end
 
   # Internal: Parse and construct the next ListItem for the current bulleted
@@ -748,27 +811,32 @@
     end
 
     # first skip the line with the marker / term
     reader.get_line
     list_item_reader = Reader.new grab_lines_for_list_item(reader, list_type, sibling_trait, has_text)
-    if list_item_reader.has_lines?
+    if list_item_reader.has_more_lines?
       comment_lines = list_item_reader.consume_line_comments
       subsequent_line = list_item_reader.peek_line
       list_item_reader.unshift(*comment_lines) unless comment_lines.empty? 
 
       if !subsequent_line.nil?
         continuation_connects_first_block = (subsequent_line == "\n")
-        content_adjacent = !subsequent_line.strip.empty?
+        # if there's no continuation connecting the first block, then
+        # treat the lines as paragraph text (activated when has_text = false)
+        if !continuation_connects_first_block && list_type != :dlist
+          has_text = false
+        end
+        content_adjacent = !subsequent_line.chomp.empty?
       else
         continuation_connects_first_block = false
         content_adjacent = false
       end
 
       # only relevant for :dlist
       options = {:text => !has_text}
 
-      while list_item_reader.has_lines?
+      while list_item_reader.has_more_lines?
         new_block = next_block(list_item_reader, list_block, {}, options)
         list_item.blocks << new_block unless new_block.nil?
       end
 
       list_item.fold_first(continuation_connects_first_block, content_adjacent)
@@ -813,11 +881,11 @@
 
     # a detached continuation is a list continuation that follows a blank line
     # it gets associated with the outermost block
     detached_continuation = nil
 
-    while reader.has_lines?
+    while reader.has_more_lines?
       this_line = reader.get_line
 
       # if we've arrived at a sibling item in this list, we've captured
       # the complete list item and can begin processing it
       # the remainder of the method determines whether we've reached
@@ -844,17 +912,16 @@
         end
       end
 
       # a delimited block immediately breaks the list unless preceded
       # by a list continuation (they are harsh like that ;0)
-      if match = delimited_block?(this_line)
+      if match = is_delimited_block?(this_line, true)
         if continuation == :active
           buffer << this_line
           # grab all the lines in the block, leaving the delimiters in place
           # we're being more strict here about the terminator, but I think that's a good thing
-          terminator = match[0]
-          buffer.concat reader.grab_lines_until(:terminator => terminator, :grab_last_line => true)
+          buffer.concat reader.grab_lines_until(:terminator => match.terminator, :grab_last_line => true)
           continuation = :inactive
         else
           break
         end
       # technically attr_line only breaks if ensuing line is not a list item
@@ -866,22 +933,22 @@
           # literal paragraphs have special considerations (and this is one of 
           # two entry points into one)
           # if we don't process it as a whole, then a line in it that looks like a
           # list item will throw off the exit from it
           if this_line.match(REGEXP[:lit_par])
-            reader.unshift this_line
+            reader.unshift_line this_line
             buffer.concat reader.grab_lines_until(
               :preserve_last_line => true,
               :break_on_blank_lines => true,
               :break_on_list_continuation => true) {|line|
                 # we may be in an indented list disguised as a literal paragraph
                 # so we need to make sure we don't slurp up a legitimate sibling
                 list_type == :dlist && is_sibling_list_item?(line, list_type, sibling_trait)
             }
             continuation = :inactive
           # let block metadata play out until we find the block
-          elsif this_line.match(REGEXP[:blk_title]) || this_line.match(REGEXP[:attr_line])
+          elsif this_line.match(REGEXP[:blk_title]) || this_line.match(REGEXP[:attr_line]) || this_line.match(REGEXP[:attr_entry])
             buffer << this_line
           else
             if nested_list_type = (within_nested_list ? [:dlist] : NESTABLE_LIST_CONTEXTS).detect {|ctx| this_line.match(REGEXP[ctx]) }
               within_nested_list = true
               if nested_list_type == :dlist && $~[3].to_s.empty?
@@ -909,11 +976,11 @@
             # for all other lists, has_text is always true
             # in this block, we have to see whether we stay in the list
             if has_text
               # slurp up any literal paragraph offset by blank lines
               if this_line.match(REGEXP[:lit_par])
-                reader.unshift this_line
+                reader.unshift_line this_line
                 buffer.concat reader.grab_lines_until(
                   :preserve_last_line => true,
                   :break_on_blank_lines => true,
                   :break_on_list_continuation => true) {|line|
                     # we may be in an indented list disguised as a literal paragraph
@@ -953,24 +1020,25 @@
         end
       end
       this_line = nil
     end
 
-    reader.unshift this_line if !this_line.nil?
+    reader.unshift_line this_line if !this_line.nil?
 
     if detached_continuation
       buffer.delete_at detached_continuation
     end
 
     # strip trailing blank lines to prevent empty blocks
-    buffer.pop while !buffer.empty? && buffer.last.strip.empty?
+    buffer.pop while !buffer.empty? && buffer.last.chomp.empty?
 
     # We do need to replace the optional trailing continuation
     # a blank line would have served the same purpose in the document
     if !buffer.empty? && buffer.last.chomp == LIST_CONTINUATION
       buffer.pop
     end
+
     #puts "BUFFER[#{list_type},#{sibling_trait}]>#{buffer.join}<BUFFER"
     #puts "BUFFER[#{list_type},#{sibling_trait}]>#{buffer}<BUFFER"
 
     buffer
   end
@@ -995,28 +1063,33 @@
     end
 
     if attributes[1]
       section.sectname = attributes[1]
       section.special = true
-      if section.sectname == 'appendix'
-        attributes['caption'] ||= "Appendix #{parent.document.counter('appendix-number', 'A')}: "
+      document = parent.document
+      if section.sectname == 'appendix' &&
+          !attributes.has_key?('caption') &&
+          !document.attributes.has_key?('caption')
+        number = document.counter('appendix-number', 'A')
+        attributes['caption'] = "#{document.attributes['appendix-caption']} #{number}: "
+        Document::AttributeEntry.new('appendix-number', number).save_to(attributes)
       end
     else
       section.sectname = "sect#{section.level}"
     end
     section.update_attributes(attributes)
-    reader.skip_blank
+    reader.skip_blank_lines
 
     section
   end
 
   # Private: Get the Integer section level based on the characters
   # used in the ASCII line under the section title.
   #
   # line - the String line from under the section title.
   def self.section_level(line)
-    char = line.strip.chars.to_a.uniq
+    char = line.chomp.chars.to_a.uniq
     case char
     when ['=']; 0
     when ['-']; 1
     when ['~']; 2
     when ['^']; 3
@@ -1030,25 +1103,29 @@
     [line.length - 1, 0].max
   end
 
   # Internal: Checks if the next line on the Reader is a section title
   #
-  # reader - the source Reader
+  # reader     - the source Reader
+  # attributes - a Hash of attributes collected above the current line
   #
   # returns the section level if the Reader is positioned at a section title,
   # false otherwise
   def self.is_next_line_section?(reader, attributes)
     return false if !attributes[1].nil? && ['float', 'discrete'].include?(attributes[1])
-    if reader.has_lines?
-      line1 = reader.get_line
-      line2 = reader.peek_line
-      reader.unshift line1
-    else
-      return false
-    end
+    return false if !reader.has_more_lines?
+    is_section_title?(*reader.peek_lines(2))
+  end
 
-    is_section_title?(line1, line2)
+  # Internal: Convenience API for checking if the next line on the Reader is the document title
+  #
+  # reader     - the source Reader
+  # attributes - a Hash of attributes collected above the current line (forwarded to is_next_line_section?)
+  #
+  # returns true if is_next_line_section? reports section level 0 for the upcoming line(s), false otherwise
+  def self.is_next_line_document_title?(reader, attributes)
+    is_next_line_section?(reader, attributes) == 0 # a false result (not a section) never equals 0
   end
 
   # Public: Checks if these lines are a section title
   #
   # line1 - the first line as a String
@@ -1170,19 +1247,18 @@
   #
   #  parse_header_metadata(Reader.new ["Author Name <author@example.org>\n", "v1.0, 2012-12-21: Coincide w/ end of world.\n"])
   #  # => {'author' => 'Author Name', 'firstname' => 'Author', 'lastname' => 'Name', 'email' => 'author@example.org',
   #  #       'revnumber' => '1.0', 'revdate' => '2012-12-21', 'revremark' => 'Coincide w/ end of world.'}
   def self.parse_header_metadata(reader, document = nil)
-    # capture consecutive comment lines so we can reinsert them after the header
-    comment_lines = reader.consume_comments
+    # NOTE this discards any comment lines, but does not skip blank lines
+    process_attribute_entries(reader, document)
 
-    metadata = !document.nil? ? document.attributes : {}
-    author_initials = metadata['authorinitials']
-    if reader.has_lines? && !reader.peek_line.strip.empty?
+    metadata = {}
+
+    if reader.has_more_lines? && !reader.peek_line.chomp.empty?
       author_line = reader.get_line
-      match = author_line.match(REGEXP[:author_info])
-      if match
+      if match = author_line.match(REGEXP[:author_info])
         metadata['firstname'] = fname = match[1].tr('_', ' ')
         metadata['author'] = fname
         metadata['authorinitials'] = fname[0, 1]
         if !match[2].nil? && !match[3].nil?
           metadata['middlename'] = mname = match[2].tr('_', ' ')
@@ -1198,32 +1274,40 @@
       else
         metadata['author'] = metadata['firstname'] = author_line.strip.squeeze(' ')
         metadata['authorinitials'] = metadata['firstname'][0, 1]
       end
 
-      # hack because of incorrect order of attribute processing
-      metadata['authorinitials'] = author_initials unless author_initials.nil?
+      # NOTE this discards any comment lines, but does not skip blank lines
+      process_attribute_entries(reader, document)
 
-      # capture consecutive comment lines so we can reinsert them after the header
-      comment_lines += reader.consume_comments
-
-      if reader.has_lines? && !reader.peek_line.strip.empty?
+      if reader.has_more_lines? && !reader.peek_line.chomp.empty?
         rev_line = reader.get_line 
-        match = rev_line.match(REGEXP[:revision_info])
-        if match
-          metadata['revdate'] = match[2]
-          metadata['revnumber'] = match[1] unless match[1].nil?
-          metadata['revremark'] = match[3] unless match[3].nil?
+        if match = rev_line.match(REGEXP[:revision_info])
+          metadata['revdate'] = match[2].strip
+          metadata['revnumber'] = match[1].rstrip unless match[1].nil?
+          metadata['revremark'] = match[3].rstrip unless match[3].nil?
         else
-          metadata['revdate'] = rev_line.strip
+          # throw it back
+          reader.unshift_line rev_line
         end
       end
 
-      reader.skip_blank
+      # NOTE this discards any comment lines, but does not skip blank lines
+      process_attribute_entries(reader, document)
+
+      reader.skip_blank_lines
+
+      # apply header subs and assign to document
+      if !document.nil?
+        metadata.map do |key, val|
+          val = document.apply_header_subs(val)
+          document.attributes[key] = val if !document.attributes.has_key?(key)
+          val
+        end
+      end
     end
 
-    reader.unshift(*comment_lines)
     metadata
   end
 
   # Internal: Parse lines of metadata until a line of metadata is not found.
   #
@@ -1239,11 +1323,11 @@
   #
   # returns the Hash of attributes including any metadata found
   def self.parse_block_metadata_lines(reader, parent, attributes = {}, options = {})
     while parse_block_metadata_line(reader, parent, attributes, options)
       # discard the line just processed
-      reader.next_line
+      reader.advance
       reader.skip_blank_lines
     end
     attributes
   end
 
@@ -1266,19 +1350,19 @@
   #              *  :text indicates that lexer is only looking for text content
   #                   and thus the block title should not be captured
   #
   # returns true if the line contains metadata, otherwise false
   def self.parse_block_metadata_line(reader, parent, attributes, options = {})
-    return false if !reader.has_lines?
+    return false if !reader.has_more_lines?
     next_line = reader.peek_line
-    if next_line.match(REGEXP[:comment])
-      # do nothing, we'll skip it
-    # QUESTION should we parse block comments here instead of next_block?
-    # disable until we can agree what the current line is coming in
-    elsif match = next_line.match(REGEXP[:comment_blk])
+    if (commentish = next_line.start_with?('//')) && (match = next_line.match(REGEXP[:comment_blk]))
       terminator = match[0]
-      reader.grab_lines_until(:skip_first_line => true, :preserve_last_line => true, :terminator => terminator)
+      reader.grab_lines_until(:skip_first_line => true, :preserve_last_line => true, :terminator => terminator, :preprocess => false)
+    elsif commentish && next_line.match(REGEXP[:comment])
+      # do nothing, we'll skip it
+    elsif !options[:text] && (match = next_line.match(REGEXP[:attr_entry]))
+      process_attribute_entry(reader, parent, attributes, match)
     elsif match = next_line.match(REGEXP[:anchor])
       id, reftext = match[1].split(',')
       attributes['id'] = id
       # AsciiDoc always use [id] as the reftext in HTML output,
       # but I'd like to do better in Asciidoctor
@@ -1288,21 +1372,71 @@
         parent.document.register(:ids, [id, reftext])
       end
     elsif match = next_line.match(REGEXP[:blk_attr_list])
       AttributeList.new(parent.document.sub_attributes(match[1]), parent.document).parse_into(attributes)
     # NOTE title doesn't apply to section, but we need to stash it for the first block
-    # TODO need test for this getting passed on to first block after section if found above section
     # TODO should issue an error if this is found above the document title
     elsif !options[:text] && (match = next_line.match(REGEXP[:blk_title]))
       attributes['title'] = match[1]
     else
       return false
     end
 
     true
   end
 
+  def self.process_attribute_entries(reader, parent, attributes = nil) # Internal: process consecutive attribute entries at the head of the reader
+    reader.skip_comment_lines # comment lines ahead of the first entry are dropped
+    while process_attribute_entry(reader, parent, attributes)
+      # discard the line just processed (process_attribute_entry leaves it at the head of the reader)
+      reader.advance
+      reader.skip_comment_lines # comment lines between entries are dropped as well
+    end
+  end
+
+  def self.process_attribute_entry(reader, parent, attributes = nil, match = nil) # Internal: apply the attribute entry at the head of the reader (or the supplied match) to the document
+    match ||= reader.has_more_lines? ? reader.peek_line.match(REGEXP[:attr_entry]) : nil # only peek at the reader when the caller did not supply a match
+    if match
+      name = match[1] # first capture: the attribute name
+      value = match[2].nil? ? '' : match[2] # second capture: the value; a missing value becomes an empty string
+      if value.end_with? LINE_BREAK # the value continues on the following line(s)
+        value.chop!.rstrip! # drop the final character and any trailing whitespace before joining
+        while reader.advance # discards the line just read; relies on advance returning a truthy value -- TODO confirm
+          next_line = reader.peek_line.strip # NOTE(review): would raise if peek_line returns nil at end of input -- confirm
+          break if next_line.empty? # a blank line ends the continued value
+          if next_line.end_with? LINE_BREAK
+            value = "#{value} #{next_line.chop.rstrip}" # continuation lines are joined with a single space
+          else
+            value = "#{value} #{next_line}"
+            break # last line of the value (no continuation marker)
+          end
+        end
+      end
+
+      if name.end_with?('!')
+        # a trailing '!' on the name unsets it: a nil value signals the attribute should be deleted (undefined)
+        value = nil
+        name = name.chop
+      end
+
+      name = sanitize_attribute_name(name)
+      accessible = true # stays true when there is no parent document to consult
+      if !parent.nil?
+        accessible = value.nil? ?
+            parent.document.delete_attribute(name) :
+            parent.document.set_attribute(name, value)
+      end
+
+      if !attributes.nil?
+        Document::AttributeEntry.new(name, value).save_to(attributes) if accessible # presumably skipped when the document rejected the change -- TODO confirm
+      end
+      true # an entry was processed; callers in this file advance the reader past the line left at its head
+    else
+      false # no match: no lines remain or the next line is not an attribute entry
+    end
+  end
+
   # Internal: Resolve the 0-index marker for this list item
   #
   # For ordered lists, match the marker used for this list item against the
   # known list markers and determine which marker is the first (0-index) marker
   # in its number series.
@@ -1385,10 +1519,11 @@
         end
         marker = 'I)'
     end
 
     if validate && expected != actual
+      # FIXME need a reader reference (or a line number passed in) so this warning can report the line number
       puts "asciidoctor: WARNING: list item index: expected #{expected}, got #{actual}"
     end
 
     marker
   end
@@ -1439,12 +1574,12 @@
       explicit_col_specs = false
     end
 
     table_reader.skip_blank_lines
 
-    parser_ctx = Asciidoctor::Table::ParserContext.new(table, attributes)
-    while table_reader.has_lines?
+    parser_ctx = Table::ParserContext.new(table, attributes)
+    while table_reader.has_more_lines?
       line = table_reader.get_line
 
       if parser_ctx.format == 'psv'
         if parser_ctx.starts_with_delimiter? line
           line = line[1..-1]
@@ -1505,11 +1640,11 @@
         end
       end
 
       table_reader.skip_blank_lines unless parser_ctx.cell_open?
 
-      if !table_reader.has_lines?
+      if !table_reader.has_more_lines?
         parser_ctx.close_cell true
       end
     end
 
     table.attributes['colcount'] ||= parser_ctx.col_count
@@ -1595,11 +1730,11 @@
     spec = (pos == :end ? {} : nil)
     rest = line
 
     if m = line.match(REGEXP[:table_cellspec][pos]) 
       spec = {}
-      return [spec, line] if m[0].strip.empty?
+      return [spec, line] if m[0].chomp.empty?
       rest = (pos == :start ? m.post_match : m.pre_match)
       if m[1]
         colspec, rowspec = m[1].split '.'
         colspec = colspec.to_s.empty? ? 1 : colspec.to_i
         rowspec = rowspec.to_s.empty? ? 1 : rowspec.to_i
@@ -1627,10 +1762,30 @@
     end 
 
     [spec, rest]
   end
 
+  # Public: Convert a string to a legal attribute name.
+  #
+  # name  - the String name of the attribute (nil is not handled)
+  #
+  # Returns a new lowercased String with every match of REGEXP[:illegal_attr_name_chars] removed.
+  #
+  # Examples
+  #
+  #   sanitize_attribute_name('Foo Bar')
+  #   # => 'foobar'
+  #
+  #   sanitize_attribute_name('foo')
+  #   # => 'foo'
+  #
+  #   sanitize_attribute_name('Foo 3 #-Billy')
+  #   # => 'foo3-billy'
+  def self.sanitize_attribute_name(name)
+    name.gsub(REGEXP[:illegal_attr_name_chars], '').downcase
+  end
+
   # Internal: Converts a Roman numeral to an integer value.
   #
   # value - The String Roman numeral to convert
   #
   # Returns the Integer for this Roman numeral
@@ -1648,6 +1803,7 @@
       end
     }
 
     result
   end
+end
 end