# = HTMLTokenizer
#
# Author::    Ben Giddings  (mailto:bg-rubyforge@infofiend.com)
# Copyright:: Copyright (c) 2004 Ben Giddings
# License::   Distributes under the same terms as Ruby
#
# This is a partial port of the functionality behind Perl's TokeParser.
# Provided a page, it progressively returns tokens from that page.
#
# $Id: htmltokenizer.rb,v 1.7 2005/06/07 21:05:53 merc Exp $

# A class to tokenize HTML.
#
# Example:
#
#   page = "<HTML>
#   <HEAD>
#   <TITLE>This is the title</TITLE>
#   </HEAD>
#   <!-- Here comes the <a href=\"missing.link\">blah</a>
#   comment body
#   -->
#   <BODY>
#     <H1>This is the header</H1>
#     <P>
#       This is the paragraph, it contains
#       <a href=\"link.html\">links</a>,
#       <img src=\"blah.gif\" optional alt='images
#   are
#   really cool'>.  Ok, here is some more text and
#       <A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.
#     </P>
#   </body>
#   </HTML>
#   "
#   toke = HTMLTokenizer.new(page)
#
#   assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase)
#   assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A"))
#   assert("links" == toke.getTrimmedText)
#   assert(toke.getTag("IMG", "A").attr_hash['optional'])
#   assert("_blank" == toke.getTag("IMG", "A").attr_hash['target'])
#
class HTMLTokenizer
  @@version = 1.0

  # Get version of HTMLTokenizer lib
  def self.version
    @@version
  end

  # The complete string being tokenized.
  attr_reader :page

  # Create a new tokenizer, based on the content, used as a string.
  # +content+ is converted with +to_s+.
  def initialize(content)
    @page = content.to_s
    @cur_pos = 0
  end

  # Reset the parser, setting the current position back at the start
  def reset
    @cur_pos = 0
  end

  # Look at the next token, but don't actually grab it.
  #
  # Returns an HTMLComment, HTMLTag or HTMLText, or nil once the whole page
  # has been consumed.  Raises a RuntimeError when a "<" starts a tag or
  # comment that is never closed.
  def peekNextToken
    return nil if @cur_pos >= @page.length

    if @page[@cur_pos, 1] == '<'
      # Next token is a tag of some kind
      if @page[@cur_pos + 1, 3] == '!--'
        # Token is a comment
        tag_end = @page.index('-->', @cur_pos + 1)
        raise "No end found to started comment:\n#{@page[@cur_pos, 80]}" if tag_end.nil?

        # tag_end points at the first "-" of "-->", so include two more chars
        HTMLComment.new(@page[@cur_pos..(tag_end + 2)])
      else
        # Token is a html tag
        tag_end = @page.index('>', @cur_pos + 1)
        raise "No end found to started tag:\n#{@page[@cur_pos, 80]}" if tag_end.nil?

        HTMLTag.new(@page[@cur_pos..tag_end])
      end
    else
      # Next token is text: everything up to the next "<", or to the end
      text_end = @page.index('<', @cur_pos)
      HTMLText.new(text_end ? @page[@cur_pos...text_end] : @page[@cur_pos..-1])
    end
  end

  # Get the next token and advance past it.  Returns an instance of
  # * HTMLText
  # * HTMLComment
  # * HTMLTag
  # or nil at the end of the page.
  def getNextToken
    token = peekNextToken
    @cur_pos += token.raw.length if token
    token
  end

  # Get a tag from the specified set of desired tags.
  # For example:
  # <tt>foo = toke.getTag("h1", "h2", "h3")</tt>
  # Will return the next header tag encountered.
  #
  # Names are matched case-insensitively; end tags are named with a leading
  # slash (e.g. "/a").  With no arguments the next tag of any kind is
  # returned.  A single array of names is accepted as well.  Returns nil if
  # the page ends before a matching tag is found.
  def getTag(*sought_tags)
    wanted = sought_tags.flatten.map { |name| name.to_s.downcase }
    while (tag = getNextToken)
      break if tag.is_a?(HTMLTag) && (wanted.empty? || wanted.include?(tag.tag_name))
    end
    tag
  end

  # Get all the text between the current position and the next tag
  # (if no argument is given) or a specific later tag.
  #
  # Without +until_tag+: returns the text token at the current position, or
  # "" when the next token is a tag or the page is exhausted.
  #
  # With +until_tag+ (matched case-insensitively): consumes tokens up to, but
  # not including, that tag and returns the non-empty text of each token
  # followed by a single space.
  def getText(until_tag = nil)
    if until_tag.nil?
      # Nothing left, or next token is a tag, not text
      return '' if @cur_pos >= @page.length || @page[@cur_pos, 1] == '<'

      getNextToken.text
    else
      stop_tag = until_tag.to_s.downcase
      collected = String.new
      while (token = peekNextToken)
        break if token.is_a?(HTMLTag) && token.tag_name == stop_tag

        piece = token.text
        collected << piece << ' ' unless piece.empty?
        # Consume the token we just peeked at
        @cur_pos += token.raw.length
      end
      collected
    end
  end

  # Like getText, but squeeze all whitespace, getting rid of
  # leading and trailing whitespace, and squeezing multiple
  # spaces into a single space.
  def getTrimmedText(until_tag = nil)
    getText(until_tag).strip.gsub(/\s+/m, ' ')
  end
end

# The parent class for all three types of HTML tokens
class HTMLToken
  # The exact source text this token was created from.
  attr_accessor :raw

  # Initialize the token based on the raw text
  def initialize(text)
    @raw = text
  end

  # By default, return exactly the string used to create the text
  def to_s
    raw
  end

  # By default tokens have no text representation
  def text
    ''
  end

  # The token's text with surrounding whitespace removed and inner
  # whitespace runs collapsed to a single space.
  def trimmed_text
    text.strip.gsub(/\s+/m, ' ')
  end

  # Compare to another based on the raw source
  def ==(other)
    raw == other.to_s
  end
end

# Class representing text that isn't inside a tag
class HTMLText < HTMLToken
  # The text of a text token is simply its raw source.
  def text
    raw
  end
end

# Class representing an HTML comment
class HTMLComment < HTMLToken
  # The comment body, without the delimiters and surrounding whitespace.
  attr_accessor :contents

  # Raises a RuntimeError unless +text+ has the form <tt><!-- ... --></tt>.
  def initialize(text)
    super(text)
    match = /^<!--\s*(.*?)\s*-->$/m.match(text)
    raise "Text passed to HTMLComment.initialize is not a comment" if match.nil?

    @contents = match[1]
  end
end

# Class representing an HTML tag
class HTMLTag < HTMLToken
  # Matches one attribute: a name, optionally followed by "=" and a value
  # that is double-quoted, single-quoted or a bare word.
  ATTRIBUTE_PATTERN = /\s*([\w:-]+)(?:\s*=\s*("[^"]*"|'[^']*'|([^"'>][^\s>]*)))?/m

  # +end_tag+ is true for closing tags.  +tag_name+ is the lower-cased name,
  # prefixed with "/" for closing tags.
  attr_reader :end_tag, :tag_name

  # Raises a RuntimeError unless +text+ starts with "<", ends with ">" and
  # contains a tag name.
  def initialize(text)
    super(text)
    if text[0, 1] != '<' || text[-1, 1] != '>'
      raise "Text passed to HTMLTag.initialize is not a tag"
    end

    @attr_hash = {}
    name = text[/[\w:-]+/]
    raise "Error, no tag name found in: #{text}" if name.nil?

    @end_tag = (text[1, 1] == '/')
    @tag_name = @end_tag ? "/#{name.downcase}" : name.downcase
    @hashed = false
  end

  # Retrieve a hash of all the tag's attributes, keyed by lower-cased name.
  # Lazily done, so that if you don't look at a tag's attributes
  # things go quicker.
  #
  # Enclosing quotes are removed from values; an attribute given without a
  # value maps to its own name.  End tags always yield an empty hash.
  def attr_hash
    return @attr_hash if @hashed

    @hashed = true
    return @attr_hash if @end_tag

    # Everything between the tag name and the closing ">"
    attrs = @raw[/<[\w:-]+\s+(.*)>/m, 1]
    return @attr_hash if attrs.nil?

    attrs.scan(ATTRIBUTE_PATTERN) do |name, value, _bare|
      @attr_hash[name.downcase] =
        if value.nil?
          name
        elsif ['"', "'"].include?(value[0, 1])
          value[1..-2]
        else
          value
        end
    end
    @attr_hash
  end

  # Get the 'alt' text for a tag, if it exists, or an empty string otherwise.
  # Only 'img' and 'applet' start tags are consulted.
  def text
    return '' if end_tag

    case tag_name
    when 'img', 'applet'
      attr_hash['alt'] || ''
    else
      ''
    end
  end
end

if $PROGRAM_NAME == __FILE__
  # test-unit may not be installed, so do not crash when it is absent.
  self_test_available =
    begin
      require 'test/unit'
      true
    rescue LoadError
      warn 'test/unit is not available; skipping HTMLTokenizer self-tests'
      false
    end

  if self_test_available
    class TC_TestHTMLTokenizer < Test::Unit::TestCase
      def test_bad_link
        toke = HTMLTokenizer.new("<p><a href=http://bad.com/link>foo</a></p>")
        assert("http://bad.com/link" == toke.getTag("a").attr_hash['href'])
      end

      def test_namespace
        toke = HTMLTokenizer.new("<f:table xmlns:f=\"http://www.com/foo\">")
        assert("http://www.com/foo" == toke.getTag("f:table").attr_hash['xmlns:f'])
      end

      def test_comment
        toke = HTMLTokenizer.new("<!-- comment on me -->")
        t = toke.getNextToken
        assert(HTMLComment == t.class)
        assert("comment on me" == t.contents)
      end

      def test_full
        page = "<HTML>
<HEAD>
<TITLE>This is the title</TITLE>
</HEAD>
<!-- Here comes the <a href=\"missing.link\">blah</a>
comment body
-->
<BODY>
  <H1>This is the header</H1>
  <P>
    This is the paragraph, it contains
    <a href=\"link.html\">links</a>,
    <img src=\"blah.gif\" optional alt='images
are
really cool'>.  Ok, here is some more text and
    <A href=\"http://another.link.com/\" target=\"_blank\">another link</A>.
  </P>
</body>
</HTML>
"
        toke = HTMLTokenizer.new(page)

        assert("<h1>" == toke.getTag("h1", "h2", "h3").to_s.downcase)
        assert(HTMLTag.new("<a href=\"link.html\">") == toke.getTag("IMG", "A"))
        assert("links" == toke.getTrimmedText)
        assert(toke.getTag("IMG", "A").attr_hash['optional'])
        assert("_blank" == toke.getTag("IMG", "A").attr_hash['target'])
      end
    end
  end
end