lib/action_controller/vendor/html-scanner/html/tokenizer.rb in actionpack-3.0.0.rc vs lib/action_controller/vendor/html-scanner/html/tokenizer.rb in actionpack-3.0.0.rc2
- old
+ new
@@ -1,9 +1,9 @@
require 'strscan'
module HTML #:nodoc:
-
+
# A simple HTML tokenizer. It simply breaks a stream of text into tokens, where each
# token is a string. Each string represents either "text", or an HTML element.
#
# This currently assumes valid XHTML, which means no free < or > characters.
#
@@ -12,17 +12,17 @@
# tokenizer = HTML::Tokenizer.new(text)
# while token = tokenizer.next
# p token
# end
class Tokenizer #:nodoc:
-
+
# The current (byte) position in the text
attr_reader :position
-
+
# The current line number
attr_reader :line
-
+
# Create a new Tokenizer for the given text.
def initialize(text)
text.encode! if text.encoding_aware?
@scanner = StringScanner.new(text)
@position = 0
@@ -40,11 +40,11 @@
update_current_line(scan_tag)
else
update_current_line(scan_text)
end
end
-
+
private
# Treat the text at the current position as a tag, and scan it. Supports
# comments, doctype tags, and regular tags, and ignores less-than and
# greater-than characters within quoted strings.
@@ -67,17 +67,17 @@
# Scan all text up to the next < character and return it.
def scan_text
"#{@scanner.getch}#{@scanner.scan(/[^<]*/)}"
end
-
+
# Counts the number of newlines in the text and updates the current line
# accordingly.
def update_current_line(text)
text.scan(/\r?\n/) { @current_line += 1 }
end
-
+
# Skips over quoted strings, so that less-than and greater-than characters
# within the strings are ignored.
def consume_quoted_regions
text = ""
loop do
@@ -101,7 +101,7 @@
end
end
text
end
end
-
+
end