# regenerate parser.rb using `tt parser.treetop`
#
# PEG grammar for the inline markup fragment handled by the formatted text
# parser. Each matched node exposes a `content` method; the top-level result
# is an Array of Hashes whose :type is :text, :element or :entity.
module Asciidoctor
  module Pdf
    module FormattedText
      grammar Markup
        # Entry point; the whole input is a (possibly empty) run of fragments.
        rule text
          complex
        end

        # Zero or more character data runs, elements and entities.
        # `content` returns the content of each child, in document order.
        rule complex
          (cdata / element / entity)* {
            def content
              elements.map {|e| e.content }
            end
          }
        end

        # Either a void element or a start tag, nested content and an end tag.
        # The end tag name is not checked against the start tag name; the
        # commented-out alternative below is the strict variant.
        rule element
          # strict tag matching (costs a minor toll)
          #void_element / start_tag complex end_tag &{|seq| seq[0].name == seq[2].name } {
          void_element / start_tag complex end_tag {
            # NOTE content only applies to non-void elements (second part of rule)
            def content
              {
                type: :element,
                name: (tag_element = elements[0]).name.to_sym,
                attributes: tag_element.attributes,
                pcdata: elements[1].content
              }
            end
          }
        end

        # An element with no content and no end tag, optionally self-closed
        # with a slash (which may be preceded by spaces).
        rule void_element
          '<' void_tag_name attributes (spaces? '/')? '>' {
            def content
              { type: :element, name: elements[1].text_value.to_sym, attributes: elements[2].content }
            end
          }
        end

        # Opening tag; exposes the tag name (String) and the attributes (Hash)
        # for use by the enclosing element rule.
        rule start_tag
          '<' tag_name attributes '>' {
            def name
              elements[1].text_value
            end

            def attributes
              elements[2].content
            end
          }
        end

        # Names accepted for elements that have an end tag.
        rule tag_name
          # QUESTION faster to do regex?
          # QUESTION can we cut stuff we aren't using? what about supporting hr?
          #'a' / 'b' / 'code' / 'color' / 'del' / 'em' / 'font' / 'i' / 'img' / 'link' / 'span' / 'strikethrough' / 'strong' / 'sub' / 'sup' / 'u'
          'a' / 'code' / 'color' / 'del' / 'em' / 'font' / 'span' / 'strong' / 'sub' / 'sup'
        end

        # Names accepted for void elements.
        rule void_tag_name
          'br' / 'img'
        end

        # Zero or more attributes collected into a Hash keyed by Symbol.
        # When a name is repeated, the last occurrence wins.
        rule attributes
          attribute* {
            def content
              elements.each_with_object({}) {|attr, accum|
                attr_name, attr_val = attr.content
                accum[attr_name.to_sym] = attr_val
              }
            end
          }
        end

        # A single name="value" pair preceded by at least one space.
        # Only double-quoted values are recognized and the value is returned
        # verbatim. `content` returns a two-element Array of name and value.
        rule attribute
          spaces [a-z_]+ '=' '"' [^"]* '"' {
            def content
              [elements[1].text_value, elements[4].text_value]
            end
          }
        end

        # Closing tag. It must consume the literal delimiters around the tag
        # name so that `name` can read the name from the second element.
        rule end_tag
          '</' tag_name '>' {
            def name
              elements[1].text_value
            end
          }
        end

        # A run of characters containing neither a tag opener nor an ampersand.
        rule cdata
          [^<&]+ {
            def content
              { type: :text, value: text_value }
            end
          }
        end

        # A named or numeric character reference. A terminal middle node is a
        # named entity; otherwise it is the hash sign followed by the digits.
        rule entity
          '&' ('#' entity_number / entity_name) ';' {
            def content
              if (entity_value = elements[1]).terminal?
                { type: :entity, name: entity_value.text_value.to_sym }
              else
                { type: :entity, number: entity_value.elements[1].text_value.to_i }
              end
            end
          }
        end

        # Between two and four decimal digits.
        rule entity_number
          [0-9] 2..4
        end

        # The named entities that are recognized.
        rule entity_name
          'amp' / 'apos' / 'gt' / 'lt' / 'quot'
        end

        # One or more space characters.
        rule spaces
          ' '+
        end
      end
    end
  end
end