require 'strscan'

module Volt
  # Raised when the markup cannot be tokenized (unclosed binding, comment,
  # script or style). The message carries the line number and, when known,
  # the file path.
  class HTMLParseError < RuntimeError
  end

  # Parses html and bindings
  # based on http://ejohn.org/files/htmlparser.js
  #
  # Takes the html and a handler object. Each of the following methods is
  # called on the handler as the matching construct is seen, but only if the
  # handler responds to it:
  #
  #   comment(text), text(text), binding(contents),
  #   start_tag(name, attributes, unary),
  #   start_section(name, attributes, unary), end_tag(name)
  #
  # This is not a full html parser, but should cover most common cases.
  class SandlebarsParser
    # Builds a lookup hash mapping every element of +array+ to true.
    def self.truth_hash(array)
      array.each_with_object({}) { |value, hash| hash[value] = true }
    end

    # regex matchers
    START_TAG = /^<([-!\:A-Za-z0-9_]+)((?:\s+[\w\-]+(?:\s*=\s*(?:(?:"[^"]*")|(?:'[^']*')|[^>\s]+))?)*)\s*(\/?)>/
    END_TAG = /^<\/([-!\:A-Za-z0-9_]+)[^>]*>/
    ATTRIBUTES = /([-\:A-Za-z0-9_]+)(?:\s*=\s*(?:(?:"((?:\\.|[^"])*)")|(?:'((?:\\.|[^'])*)')|([^>\s]+)))?/

    # Types of elements
    BLOCK = truth_hash(%w{a address applet blockquote button center dd del dir div dl dt fieldset form frameset hr iframe ins isindex li map menu noframes noscript object ol p pre script table tbody td tfoot th thead tr ul})
    EMPTY = truth_hash(%w{area base basefont br col frame hr img input isindex link meta param embed})
    INLINE = truth_hash(%w{abbr acronym applet b basefont bdo big br button cite code del dfn em font i iframe img input ins kbd label map object q s samp script select small span strike strong sub sup textarea tt u var})
    CLOSE_SELF = truth_hash(%w{colgroup dd dt li options p td tfoot th thead tr})
    SPECIAL = truth_hash(%w{script style})

    FILL_IN_ATTRIBUTES = truth_hash(%w{checked compact declare defer disabled ismap multiple nohref noresize noshade nowrap readonly selected})

    # html      - String of markup to parse.
    # handler   - object receiving the callbacks listed on the class.
    # file_path - optional path, only used in parse error messages.
    #
    # Parsing runs immediately; the handler has received every event by the
    # time the constructor returns.
    def initialize(html, handler, file_path=nil)
      @html = StringScanner.new(html)
      @handler = handler
      @file_path = file_path

      # Names (downcased) of the tags that are currently open.
      @stack = []

      parse
    end

    # The innermost open tag name, or nil when no tag is open.
    def last
      @stack.last
    end

    # Walks the whole document, dispatching one construct per iteration, then
    # closes whatever tags are still open.
    #
    # Raises HTMLParseError for an unclosed binding, comment, script or style.
    def parse
      loop do
        if last && SPECIAL[last]
          # In a script or style tag, just look for the first end
          scan_special_body
        elsif @html.scan(/\<\!--/)
          # start comment
          scan_comment
        elsif (tag = @html.scan(START_TAG))
          start_tag(tag, @html[1], @html[2], @html[3])
        elsif @html.scan(END_TAG)
          tag_name = @html[1]
          end_tag(tag_name, tag_name)
        elsif @html.scan(/\{\{\{(.*?)\}\}\}(?!\})/)
          # Anything between {{{ and }}} is escaped and not processed
          # (treated as text). The lookahead makes sure the closing }}} is
          # not followed by another } without consuming the next character,
          # so the scan position never has to be moved back by hand.
          text(@html[1])
        elsif @html.scan(/\{\{/)
          # We are in text mode and matched the start of a binding
          start_binding
        elsif (chunk = @html.scan(/\{/))
          # A single { outside of a binding
          text(chunk)
        elsif (chunk = @html.scan(/(?:[^\<\{]+)/))
          # matched text up until the next html tag
          text(chunk)
        elsif (chunk = @html.scan(/\</))
          # A < that does not start a tag or comment (ex. "1 < 2"). Pass it
          # through as text rather than stopping and dropping the rest of
          # the document.
          text(chunk)
        else
          # Nothing left
          break
        end
      end

      end_tag(nil, nil)
    end

    # Forwards a run of plain text to the handler.
    def text(text)
      @handler.text(text) if @handler.respond_to?(:text)
    end

    # Finds the end of a binding. Called right after the opening {{ has been
    # consumed; nested {{ }} pairs are counted so the binding ends at the
    # matching }}. The handler receives everything between the outer braces.
    #
    # Raises HTMLParseError when a newline or the end of the document is
    # reached before the binding is closed.
    def start_binding
      contents = ''.dup
      open_count = 1

      # scan until we reach a {{ or }}
      loop do
        # \Z guarantees a match, so scan_until never returns nil here.
        contents << @html.scan_until(/(\{\{|\}\}|\n|\Z)/)

        case @html[1]
        when '}}'
          # close
          open_count -= 1
          break if open_count == 0
        when '{{'
          # open more
          open_count += 1
        else
          # Hit a newline or the end of the document before the close
          raise_parse_error("unclosed binding: {#{contents.strip}")
        end
      end

      # Drop the trailing }}
      contents = contents[0..-3]
      @handler.binding(contents) if @handler.respond_to?(:binding)
    end

    # Raises HTMLParseError with +error+ followed by the current line number
    # and, when a file path was given, the file path.
    def raise_parse_error(error)
      # pre_match is nil when the last scan failed (ex. an unclosed comment),
      # so fall back to everything consumed up to the scan position.
      consumed = @html.pre_match || @html.string.byteslice(0, @html.pos)
      line_number = consumed.count("\n") + 1

      error_str = error + " on line: #{line_number}"
      error_str += " of #{@file_path}" if @file_path

      raise HTMLParseError, error_str
    end

    # Handles an opening tag.
    #
    # tag      - the full matched tag text (used to echo a doctype verbatim).
    # tag_name - the tag name as written; it is downcased here.
    # rest     - the raw attribute portion of the tag.
    # unary    - "/" when the tag was written self-closed, otherwise blank.
    def start_tag(tag, tag_name, rest, unary)
      # Section tags look like <:Body> - a colon followed by a capital.
      section_tag = !!(tag_name =~ /\A:[A-Z]/)

      tag_name = tag_name.downcase

      # handle doctype so we get it output exactly the same way
      if tag_name == '!doctype'
        text(tag)
        return
      end

      # Auto-close the last inline tag if we started a new block
      end_tag(nil, last) if BLOCK[tag_name] && last && INLINE[last]

      # Some tags close themselves when a new one of themselves is reached.
      # ex, a tr will close the previous tr
      end_tag(nil, tag_name) if CLOSE_SELF[tag_name] && last == tag_name

      unary = EMPTY[tag_name] || !unary.to_s.strip.empty?

      # Section tags are also unary, so they are never pushed
      @stack.push(tag_name) unless unary || section_tag

      # Only build the attributes when the callback that will receive them
      # actually exists on the handler.
      callback = section_tag ? :start_section : :start_tag
      return unless @handler.respond_to?(callback)

      attributes = {}

      # Take the rest string and extract the attributes, filling in any
      # "fill in" attribute values if not provided.
      rest.scan(ATTRIBUTES) do |name, double_quoted, single_quoted, bare|
        attributes[name] = double_quoted || single_quoted || bare || FILL_IN_ATTRIBUTES[name] || ''
      end

      if section_tag
        @handler.start_section(tag_name, attributes, unary)
      else
        @handler.start_tag(tag_name, attributes, unary)
      end
    end

    # Closes open tags, innermost first, notifying the handler for each.
    #
    # tag      - kept for backward compatibility; the decision is made on
    #            tag_name alone.
    # tag_name - name of the tag to close. The closest open tag with that
    #            name is closed together with everything opened inside it.
    #            When nil, every open tag is closed. When no open tag has
    #            that name the call does nothing.
    def end_tag(tag, tag_name)
      if tag_name
        # The stack holds downcased names, so compare case-insensitively.
        new_size = @stack.rindex(tag_name.downcase)

        # A stray end tag (ex. the real </span> of a span that was already
        # auto-closed) must not close unrelated tags.
        return unless new_size
      else
        # If no tag name is provided, close all the way up
        new_size = 0
      end

      if @handler.respond_to?(:end_tag)
        (@stack.size - 1).downto(new_size) do |index|
          @handler.end_tag(@stack[index])
        end
      end

      @stack = @stack[0...new_size]
    end

    # Emits the body of a script/style element as text and closes the element.
    #
    # close_tag - the closing tag text; that many characters are cut from the
    #             end of body.
    # body      - everything scanned up to and including the closing tag.
    def special_tag(close_tag, body)
      body = body[0..(-close_tag.size - 1)]

      # Unwrap single-line comment and CDATA guards around the content
      body = body.gsub(/\<\!--(.*?)--\>/, "\\1").gsub(/\<\!\[CDATA\[(.*?)\]\]\>/, "\\1")

      text(body)

      end_tag(last, last)
    end

    private

    # Consumes a comment body (the opening <!-- is already consumed) and
    # reports it to the handler without the trailing -->.
    def scan_comment
      comment = @html.scan_until(/--\>/)
      raise_parse_error('unclosed comment') unless comment

      @handler.comment(comment[0..-4]) if @handler.respond_to?(:comment)
    end

    # Consumes everything up to the closing tag of the open script/style
    # element and hands it to special_tag.
    def scan_special_body
      close_tag = "</#{last}>"

      # Open tag names are stored downcased, so accept any case on the close.
      body = @html.scan_until(Regexp.new(Regexp.escape(close_tag), Regexp::IGNORECASE))
      raise_parse_error("unclosed #{last} tag") unless body

      special_tag(close_tag, body)
    end
  end
end