lib/action_controller/vendor/html-scanner/html/tokenizer.rb in actionpack-3.0.0.rc vs lib/action_controller/vendor/html-scanner/html/tokenizer.rb in actionpack-3.0.0.rc2

- old
+ new

@@ -1,9 +1,9 @@ require 'strscan' module HTML #:nodoc: - + # A simple HTML tokenizer. It simply breaks a stream of text into tokens, where each # token is a string. Each string represents either "text", or an HTML element. # # This currently assumes valid XHTML, which means no free < or > characters. # @@ -12,17 +12,17 @@ # tokenizer = HTML::Tokenizer.new(text) # while token = tokenizer.next # p token # end class Tokenizer #:nodoc: - + # The current (byte) position in the text attr_reader :position - + # The current line number attr_reader :line - + # Create a new Tokenizer for the given text. def initialize(text) text.encode! if text.encoding_aware? @scanner = StringScanner.new(text) @position = 0 @@ -40,11 +40,11 @@ update_current_line(scan_tag) else update_current_line(scan_text) end end - + private # Treat the text at the current position as a tag, and scan it. Supports # comments, doctype tags, and regular tags, and ignores less-than and # greater-than characters within quoted strings. @@ -67,17 +67,17 @@ # Scan all text up to the next < character and return it. def scan_text "#{@scanner.getch}#{@scanner.scan(/[^<]*/)}" end - + # Counts the number of newlines in the text and updates the current line # accordingly. def update_current_line(text) text.scan(/\r?\n/) { @current_line += 1 } end - + # Skips over quoted strings, so that less-than and greater-than characters # within the strings are ignored. def consume_quoted_regions text = "" loop do @@ -101,7 +101,7 @@ end end text end end - + end