lexer.rb in asciidoctor-0.0.6

- old
+ new

@@ -26,10 +26,11 @@
   def self.next_block(reader, parent = self)
     # Skip ahead to the block content
     reader.skip_blank
 
     return nil unless reader.has_lines?
+    context = parent.is_a?(Block) ? parent.context : nil
 
     # NOTE: An anchor looks like this:
     #   [[foo]]
     # with the inside [foo] (including brackets) as match[1]
     if match = reader.peek_line.match(REGEXP[:anchor])
@@ -43,10 +44,15 @@
       reader.get_line
     else
       anchor = nil
     end
 
+    # skip a list continuation character if we're processing a list
+    if LIST_CONTEXTS.include?(context)
+      reader.skip_list_continuation
+    end
+
     Asciidoctor.debug "/"*64
     Asciidoctor.debug "#{File.basename(__FILE__)}:#{__LINE__} -> #{__method__} - First two lines are:"
     Asciidoctor.debug reader.peek_line
     tmp_line = reader.get_line
     Asciidoctor.debug reader.peek_line
@@ -54,48 +60,50 @@
     Asciidoctor.debug "/"*64
 
     block = nil
     title = nil
     caption = nil
-    source_type = nil
     buffer = []
+    attributes = {}
+    context = parent.is_a?(Block) ? parent.context : nil
     while reader.has_lines? && block.nil?
       buffer.clear
       this_line = reader.get_line
       next_line = reader.peek_line || ''
 
       if this_line.match(REGEXP[:comment_blk])
         Reader.new(reader.grab_lines_until {|line| line.match( REGEXP[:comment_blk] ) })
-        next
 
       elsif this_line.match(REGEXP[:comment])
-        next
-
-      elsif match = this_line.match(REGEXP[:title])
-        title = match[1]
         reader.skip_blank
 
-      elsif match = this_line.match(REGEXP[:listing_source])
-        source_type = match[1]
+      elsif match = this_line.match(REGEXP[:attr_list_blk])
+        collect_attributes(match[1], attributes)
         reader.skip_blank
 
-      elsif match = this_line.match(REGEXP[:caption])
-        caption = match[1]
-
       elsif is_section_heading?(this_line, next_line)
         # If we've come to a new section, then we've found the end of this
         # current block.  Likewise if we'd found an unassigned anchor, push
-        # it back as well, so it can go with this next heading.
-        # NOTE - I don't think this will assign the anchor properly. Anchors
-        # only match with double brackets - [[foo]], but what's stored in
-        # `anchor` at this point is only the `foo` part that was stripped out
-        # after matching.  TODO: Need a way to test this.
+        #
+        # FIXME when slurping up next section, give back trailing anchor to following section
         reader.unshift(this_line)
-        reader.unshift(anchor) unless anchor.nil?
         Asciidoctor.debug "#{__method__}: SENDING to next_section with lines[0] = #{reader.peek_line}"
         block = next_section(reader, parent)
 
+      elsif match = this_line.match(REGEXP[:title])
+        title = match[1]
+        reader.skip_blank
+
+      elsif match = this_line.match(REGEXP[:image_blk])
+        collect_attributes(match[2], attributes, ['alt', 'width', 'height'])
+        block = Block.new(parent, :image)
+        # FIXME this seems kind of one-off here
+        target = block.sub_attributes(match[1])
+        attributes['target'] = target
+        attributes['alt'] ||= File.basename(target, File.extname(target))
+        reader.skip_blank
+
       elsif this_line.match(REGEXP[:oblock])
         # oblock is surrounded by '--' lines and has zero or more blocks inside
         buffer = Reader.new(reader.grab_lines_until { |line| line.match(REGEXP[:oblock]) })
 
         # Strip lines off end of block - not implemented yet
@@ -107,29 +115,38 @@
         while buffer.has_lines?
           new_block = next_block(buffer, block)
           block.blocks << new_block unless new_block.nil?
         end
 
+      # needs to come before list detection
+      elsif this_line.match(REGEXP[:sidebar_blk])
+        # sidebar is surrounded by '****' (4 or more '*' chars) lines
+        # FIXME violates DRY because it's a duplication of quote parsing
+        block = Block.new(parent, :sidebar)
+        buffer = Reader.new(reader.grab_lines_until {|line| line.match( REGEXP[:sidebar_blk] ) })
+
+        while buffer.has_lines?
+          new_block = next_block(buffer, block)
+          block.blocks << new_block unless new_block.nil?
+        end
+
       elsif list_type = [:olist, :colist].detect{|l| this_line.match( REGEXP[l] )}
         items = []
         Asciidoctor.debug "Creating block of type: #{list_type}"
         block = Block.new(parent, list_type)
+        attributes['style'] ||= 'arabic'
         while !this_line.nil? && match = this_line.match(REGEXP[list_type])
-          item = ListItem.new
+          item = ListItem.new(block)
 
           reader.unshift match[2].lstrip.sub(/^\./, '\.')
           item_segment = Reader.new(list_item_segment(reader, :alt_ending => REGEXP[list_type]))
           while item_segment.has_lines?
             new_block = next_block(item_segment, block)
             item.blocks << new_block unless new_block.nil?
           end
 
-          if item.blocks.any? &&
-             item.blocks.first.is_a?(Block) &&
-             (item.blocks.first.context == :paragraph || item.blocks.first.context == :literal)
-            item.content = item.blocks.shift.buffer.map{|l| l.strip}.join("\n")
-          end
+          item.fold_first
 
           items << item
 
           reader.skip_blank
 
@@ -138,121 +155,173 @@
         reader.unshift(this_line) unless this_line.nil?
 
         block.buffer = items
 
       elsif match = this_line.match(REGEXP[:ulist])
-
         reader.unshift(this_line)
         block = build_ulist(reader, parent)
 
       elsif match = this_line.match(REGEXP[:dlist])
+        # TODO build_dlist method?
         pairs = []
         block = Block.new(parent, :dlist)
+        # allows us to capture until we find a labeled item using the same delimiter (::, :::, :::: or ;;)
+        sibling_matcher = REGEXP[:dlist_siblings][match[3]]
 
-        this_dlist = Regexp.new(/^#{match[1]}(.*)#{match[3]}\s*$/)
+        begin
+          dt = ListItem.new(block, match[2])
+          dt.anchor = match[1] unless match[1].nil?
+          dd = ListItem.new(block, match[5])
 
-        while !this_line.nil? && match = this_line.match(this_dlist)
-          if anchor = match[1].match( /\[\[([^\]]+)\]\]/ )
-            dt = ListItem.new( $` + $' )
-            dt.anchor = anchor[1]
-          else
-            dt = ListItem.new( match[1] )
-          end
-          dd = ListItem.new
-          # workaround eg. git-config OPTIONS --get-colorbool
-          reader.get_line if reader.has_lines? && reader.peek_line.strip.empty?
-
-          dd_segment = Reader.new(list_item_segment(reader, :alt_ending => this_dlist))
+          dd_segment = Reader.new(list_item_segment(reader, :alt_ending => sibling_matcher))
           while dd_segment.has_lines?
             new_block = next_block(dd_segment, block)
             dd.blocks << new_block unless new_block.nil?
           end
 
-          if dd.blocks.any? &&
-             dd.blocks.first.is_a?(Block) &&
-             (dd.blocks.first.context == :paragraph || dd.blocks.first.context == :literal)
-            dd.content = dd.blocks.shift.buffer.map{|l| l.strip}.join("\n")
-          end
+          dd.fold_first
 
           pairs << [dt, dd]
 
+          # this skip_blank might be redundant
           reader.skip_blank
-
           this_line = reader.get_line
-        end
+        end while !this_line.nil? && match = this_line.match(sibling_matcher)
+
         reader.unshift(this_line) unless this_line.nil?
         block.buffer = pairs
+    
+      # FIXME violates DRY because it's a duplication of other block parsing
+      elsif this_line.match(REGEXP[:example])
+        # example is surrounded by lines with 4 or more '=' chars
+        rekey_positional_attributes(attributes, ['style'])
+        if admonition_style = ADMONITION_STYLES.detect {|s| attributes['style'] == s}
+          block = Block.new(parent, :admonition)
+          attributes['name'] = admonition_style.downcase
+          attributes['caption'] ||= admonition_style.capitalize
+        else
+          block = Block.new(parent, :example)
+        end
+        buffer = Reader.new(reader.grab_lines_until {|line| line.match( REGEXP[:example] ) })
 
-      elsif this_line.match(REGEXP[:verse])
-        # verse is preceded by [verse] and lasts until a blank line
-        buffer = reader.grab_lines_until(:break_on_blank_lines => true)
-        block = Block.new(parent, :verse, buffer)
+        while buffer.has_lines?
+          new_block = next_block(buffer, block)
+          block.blocks << new_block unless new_block.nil?
+        end
 
-      elsif this_line.match(REGEXP[:note])
-        # note is an admonition preceded by [NOTE] and lasts until a blank line
-        buffer = reader.grab_lines_until(:break_on_blank_lines => true)
-        block = Block.new(parent, :note, buffer)
+      # FIXME violates DRY w/ non-delimited block listing
+      elsif this_line.match(REGEXP[:listing])
+        rekey_positional_attributes(attributes, ['style', 'language', 'linenums'])
+        buffer = reader.grab_lines_until {|line| line.match( REGEXP[:listing] )}
+        buffer.last.chomp! unless buffer.empty?
+        block = Block.new(parent, :listing, buffer)
 
-      elsif block_type = [:listing, :example].detect{|t| this_line.match( REGEXP[t] )}
-        buffer = reader.grab_lines_until {|line| line.match( REGEXP[block_type] )}
-        block = Block.new(parent, block_type, buffer)
-
-      elsif this_line.match( REGEXP[:quote] )
-        block = Block.new(parent, :quote)
+      elsif this_line.match(REGEXP[:quote])
+        # multi-line verse or quote is surrounded by a block delimiter
+        rekey_positional_attributes(attributes, ['style', 'attribution', 'citetitle'])
+        quote_context = (attributes['style'] == 'verse' ? :verse : :quote)
         buffer = Reader.new(reader.grab_lines_until {|line| line.match( REGEXP[:quote] ) })
 
-        while buffer.has_lines?
-          new_block = next_block(buffer, block)
-          block.blocks << new_block unless new_block.nil?
+        # only quote can have other section elements (as a section block)
+        section_body = (quote_context == :quote)
+
+        if section_body
+          block = Block.new(parent, quote_context)
+          while buffer.has_lines?
+            new_block = next_block(buffer, block)
+            block.blocks << new_block unless new_block.nil?
+          end
+        else
+          block = Block.new(parent, quote_context, buffer.lines)
         end
 
       elsif this_line.match(REGEXP[:lit_blk])
         # example is surrounded by '....' (4 or more '.' chars) lines
         buffer = reader.grab_lines_until {|line| line.match( REGEXP[:lit_blk] ) }
+        buffer.last.chomp! unless buffer.empty?
         block = Block.new(parent, :literal, buffer)
 
       elsif this_line.match(REGEXP[:lit_par])
         # literal paragraph is contiguous lines starting with
         # one or more space or tab characters
 
         # So we need to actually include this one in the grab_lines group
         reader.unshift this_line
-        buffer = reader.grab_lines_until(:preserve_last_line => true) {|line| ! line.match( REGEXP[:lit_par] ) }
+        buffer = reader.grab_lines_until(:preserve_last_line => true) {|line|
+          (context == :dlist && line.match(REGEXP[:dlist])) || !line.match(REGEXP[:lit_par])
+        }
 
+        # trim off the indentation that put us in this literal paragraph
+        if !buffer.empty? && match = buffer.first.match(/^([[:blank:]]+)/)
+          offset = match[1].length
+          buffer = buffer.map {|l| l.slice(offset..-1)}
+          buffer.last.chomp!
+        end
+
         block = Block.new(parent, :literal, buffer)
 
-      elsif this_line.match(REGEXP[:sidebar_blk])
-        # example is surrounded by '****' (4 or more '*' chars) lines
-        buffer = reader.grab_lines_until {|line| line.match( REGEXP[:sidebar_blk] ) }
-        block = Block.new(parent, :sidebar, buffer)
+      ## these switches based on style need to come immediately before the else ##
 
+      elsif attributes[0] == 'source'
+        rekey_positional_attributes(attributes, ['style', 'language', 'linenums'])
+        reader.unshift(this_line)
+        buffer = reader.grab_lines_until(:break_on_blank_lines => true)
+        buffer.last.chomp! unless buffer.empty?
+        block = Block.new(parent, :listing, buffer)
+
+      elsif admonition_style = ADMONITION_STYLES.detect{|s| attributes[0] == s}
+        # an admonition paragraph is preceded by [TYPE] (one of ADMONITION_STYLES) and lasts until a blank line
+        reader.unshift(this_line)
+        buffer = reader.grab_lines_until(:break_on_blank_lines => true)
+        block = Block.new(parent, :admonition, buffer)
+        attributes['style'] = admonition_style
+        attributes['name'] = admonition_style.downcase
+        attributes['caption'] ||= admonition_style.capitalize
+
+      elsif quote_context = [:quote, :verse].detect{|s| attributes[0] == s.to_s}
+        # single-paragraph verse or quote is preceded by [verse] or [quote], respectively, and lasts until a blank line
+        rekey_positional_attributes(attributes, ['style', 'attribution', 'citetitle'])
+        reader.unshift(this_line)
+        buffer = reader.grab_lines_until(:break_on_blank_lines => true)
+        block = Block.new(parent, quote_context, buffer)
+
       else
         # paragraph is contiguous nonblank/noncontinuation lines
-        while !this_line.nil? && !this_line.strip.empty?
-          if this_line.match( REGEXP[:listing] ) || this_line.match( REGEXP[:oblock] )
-            reader.unshift this_line
-            break
-          end
-          buffer << this_line
-          this_line = reader.get_line
+        reader.unshift this_line
+        buffer = reader.grab_lines_until(:break_on_blank_lines => true, :preserve_last_line => true) {|line|
+          (context == :dlist && line.match(REGEXP[:dlist])) ||
+          ([:ulist, :olist, :dlist].include?(context) && line.chomp == LIST_CONTINUATION) ||
+          line.match(REGEXP[:oblock])
+        }
+
+        if LIST_CONTEXTS.include?(context)
+          reader.skip_list_continuation
         end
 
-        if buffer.any? && admonition = buffer.first.match(/^NOTE:\s*/)
+        if !buffer.empty? && admonition = buffer.first.match(Regexp.new('^(' + ADMONITION_STYLES.join('|') + '):\s+'))
           buffer[0] = admonition.post_match
-          block = Block.new(parent, :note, buffer)
-        elsif source_type
-          block = Block.new(parent, :listing, buffer)
+          block = Block.new(parent, :admonition, buffer)
+          attributes['style'] = admonition[1]
+          attributes['name'] = admonition[1].downcase
+          attributes['caption'] ||= admonition[1].capitalize
         else
+          buffer.last.chomp! unless buffer.empty?
           Asciidoctor.debug "Proud parent #{parent} getting a new paragraph with buffer: #{buffer}"
           block = Block.new(parent, :paragraph, buffer)
         end
       end
     end
 
-    block.anchor  ||= anchor
-    block.title   ||= title
-    block.caption ||= caption
+    # when looking for nested content, a series of
+    # line comments or a comment block could leave us
+    # without a block
+    if !block.nil?
+      block.anchor   ||= (anchor || attributes['id'])
+      block.title    ||= title
+      block.caption  ||= caption
+      block.update_attributes(attributes)
+    end
 
     block
   end
 
   # Private: Return the Array of lines constituting the next list item
@@ -350,11 +419,11 @@
       segment << this_line
     end
 
     Asciidoctor.debug "*"*40
     Asciidoctor.debug "#{File.basename(__FILE__)}:#{__LINE__} -> #{__method__}: Returning this:"
-    Asciidoctor.debug segment.inspect
+    #Asciidoctor.debug segment.inspect
     Asciidoctor.debug "*"*10
     Asciidoctor.debug "Leaving #{__method__}: Top of reader queue is:"
     Asciidoctor.debug reader.peek_line
     Asciidoctor.debug "*"*40
     segment
@@ -381,15 +450,16 @@
       return nil
     end
 
     level = match[1].length
 
-    list_item = ListItem.new
+    list_item = ListItem.new(block)
     list_item.level = level
     Asciidoctor.debug "#{__FILE__}:#{__LINE__}: Created ListItem #{list_item} with match[2]: #{match[2]} and level: #{list_item.level}"
 
-    # Prevent bullet list text starting with . from being treated as a paragraph
+    # Restore first line of list item
+    # Also prevent bullet list text starting with . from being treated as a paragraph
     # title or some other unseemly thing in list_item_segment. I think. (NOTE)
     reader.unshift match[2].lstrip.sub(/^\./, '\.')
 
     item_segment = Reader.new(list_item_segment(reader, :alt_ending => REGEXP[list_type]))
 #    item_segment = list_item_segment(reader)
@@ -398,16 +468,11 @@
       list_item.blocks << new_block unless new_block.nil?
     end
 
     Asciidoctor.debug "\n\nlist_item has #{list_item.blocks.count} blocks, and first is a #{list_item.blocks.first.class} with context #{list_item.blocks.first.context rescue 'n/a'}\n\n"
 
-    first_block = list_item.blocks.first
-    if first_block.is_a?(Block) &&
-       (first_block.context == :paragraph || first_block.context == :literal)
-      list_item.content = first_block.buffer.map{|l| l.strip}.join("\n")
-      list_item.blocks.shift
-    end
+    list_item.fold_first
 
     list_item
   end
 
   def self.build_ulist(reader, parent = nil)
@@ -421,19 +486,22 @@
 
       this_item_level = match[1].length
 
       if first_item_level && first_item_level < this_item_level
         # If this next :uline level is down one from the
-        # current Block's, put it in a Block of its own
-        list_item = next_block(reader, block)
+        # current Block's, append it to content of the current list item
+        items.last.blocks << next_block(reader, block)
+      elsif first_item_level && first_item_level > this_item_level
+        break
       else
         list_item = build_ulist_item(reader, block, match)
         # Set the base item level for this Block
         first_item_level ||= list_item.level
       end
 
-      items << list_item
+      items << list_item unless list_item.nil?
+      list_item = nil
 
       reader.skip_blank
     end
 
     block.buffer = items
@@ -449,27 +517,22 @@
     this_line = lines.shift
 
     while this_line && match = this_line.match(REGEXP[list_type])
       level = match[1].length
 
-      list_item = ListItem.new
+      list_item = ListItem.new(block)
       list_item.level = level
       Asciidoctor.debug "Created ListItem #{list_item} with match[2]: #{match[2]} and level: #{list_item.level}"
 
       lines.unshift match[2].lstrip.sub(/^\./, '\.')
       item_segment = list_item_segment(lines, :alt_ending => REGEXP[list_type], :list_level => level)
       while item_segment.any?
         new_block = next_block(item_segment, block)
         list_item.blocks << new_block unless new_block.nil?
       end
 
-      first_block = list_item.blocks.first
-      if first_block.is_a?(Block) &&
-         (first_block.context == :paragraph || first_block.context == :literal)
-        list_item.content = first_block.buffer.map{|l| l.strip}.join("\n")
-        list_item.blocks.shift
-      end
+      list_item.fold_first
 
       if items.any? && (level > items.last.level)
         Asciidoctor.debug "--> Putting this new level #{level} ListItem under my pops, #{items.last} (level: #{items.last.level})"
         items.last.blocks << list_item
       else
@@ -488,10 +551,36 @@
 
     block.buffer = items
     block
   end
 
+  def self.collect_attributes(attrs, attributes, posattrs = [])
+    # TODO walk be properly rather than using split
+    attrs.split(/\s*,\s*/).each_with_index do |entry, i|
+      key, val = entry.split(/\s*=\s*/) 
+      if !val.nil?
+        val.gsub!(/^(['"])(.*)\1$/, '\2') unless val.nil?
+        attributes[key] = val
+      else
+        attributes[i] = key
+        # positional attribute has a known key
+        if posattrs.size >= (i + 1)
+          attributes[posattrs[i]] = key
+        end 
+      end
+    end
+  end
+
+  def self.rekey_positional_attributes(attributes, posattrs)
+    posattrs.each_with_index do |key, i|
+      val = attributes[i]
+      if !val.nil?
+        attributes[key] = val
+      end
+    end
+  end
+
   # Private: Get the Integer section level based on the characters
   # used in the ASCII line under the section name.
   #
   # line - the String line from under the section name.
   def self.section_level(line)
@@ -515,11 +604,12 @@
   end
 
   def self.is_two_line_section_heading?(line1, line2)
     !line1.nil? && !line2.nil? &&
     line1.match(REGEXP[:name]) && line2.match(REGEXP[:line]) &&
-    (line1.size - line2.size).abs <= 1
+    # chomp so that a trailing (non-visible) line ending does not skew the length comparison
+    (line1.chomp.size - line2.chomp.size).abs <= 1
   end
 
   def self.is_section_heading?(line1, line2 = nil)
     is_single_line_section_heading?(line1) ||
     is_two_line_section_heading?(line1, line2)
@@ -645,16 +735,10 @@
           break
         else
           section_lines << this_line
           section_lines << reader.get_line unless is_single_line_section_heading?(this_line)
         end
-      elsif this_line.match(REGEXP[:listing])
-        section_lines << this_line
-        section_lines.concat reader.grab_lines_until {|line| line.match( REGEXP[:listing] ) }
-        # Also grab the last line, if there is one
-        this_line = reader.get_line
-        section_lines << this_line unless this_line.nil?
       else
         section_lines << this_line
       end
     end
 
@@ -664,9 +748,22 @@
       section_reader.skip_blank
 
       if section_reader.has_lines?
         new_block = next_block(section_reader, section)
         section << new_block unless new_block.nil?
+      end
+    end
+
+    # detect preamble and push it into a block
+    # QUESTION make this an operation on Section?
+    if section.level == 0
+      blocks = section.blocks.take_while {|b| !b.is_a? Section}
+      if !blocks.empty?
+        # QUESTION Should we propagate the buffer?
+        #preamble = Block.new(section, :preamble, blocks.reduce {|a, b| a.buffer + b.buffer})
+        preamble = Block.new(section, :preamble)
+        blocks.each { preamble << section.delete_at(0) }
+        section.insert(0, preamble)
       end
     end
 
     section
   end