# Copyright (C) 2003 Ben Giddings # # Author Ben Giddings # # This file may be used under the same license as Ruby # # Description: # This is a partial port of the functionality behind Perl's TokeParser # Provided a page it progressively returns tokens from that page # $Id: htmltokenizer.rb,v 1.3 2004/03/04 21:28:16 merc Exp $ class HTMLTokenizer attr_reader :page def initialize(content) @page = content @cur_pos = 0 end def reset @cur_pos = 0 end def peekNextToken if @cur_pos == @page.length then return nil end if ?< == @page[@cur_pos] # Next token is a tag of some kind if '!--' == @page[(@cur_pos + 1), 3] # Token is a comment tag_end = @page.index('-->', (@cur_pos + 1)) if tag_end.nil? raise "No end found to started comment:\n#{@page[@cur_pos,80]}" end # p @page[@cur_pos .. (tag_end+2)] HTMLComment.new(@page[@cur_pos .. (tag_end + 2)]) else # Token is a html tag tag_end = @page.index('>', (@cur_pos + 1)) if tag_end.nil? raise "No end found to started tag:\n#{@page[@cur_pos,80]}" end # p @page[@cur_pos .. tag_end] HTMLTag.new(@page[@cur_pos .. tag_end]) end else # Next token is text text_end = @page.index('<', @cur_pos) text_end = text_end.nil? ? -1 : (text_end - 1) # p @page[@cur_pos .. text_end] HTMLText.new(@page[@cur_pos .. text_end]) end end def getNextToken token = peekNextToken if token # @page = @page[token.raw.length .. -1] # @page.slice!(0, token.raw.length) @cur_pos += token.raw.length end #p token #print token.raw return token end def getTag(*sought_tags) sought_tags.collect! {|elm| elm.downcase} while (tag = getNextToken) if tag.kind_of?(HTMLTag) and (0 == sought_tags.length or sought_tags.include?(tag.tag_name)) break end end tag end def getText(until_tag = nil) if until_tag.nil? if ?< == @page[@cur_pos] # Next token is a tag, not text "" else # Next token is text getNextToken.text end else ret_str = "" while (tag = peekNextToken) if tag.kind_of?(HTMLTag) and tag.tag_name == until_tag break end if ("" != tag.text) ret_str << (tag.text + " ") end getNextToken end ret_str end end def getTrimmedText(until_tag = nil) getText(until_tag).strip.gsub(/\s+/m, " ") end end class HTMLToken attr_accessor :raw def initialize(text) @raw = text end def to_s raw end def text # By default tokens have no text representation "" end def trimmed_text text.strip.gsub(/\s+/m, " ") end end class HTMLText < HTMLToken def text raw end end class HTMLComment < HTMLToken attr_accessor :contents def initialize(text) super(text) temp_arr = text.scan(/^$/m) if temp_arr[0].nil? raise "Text passed to HTMLComment.initialize is not a comment" end @contents = temp_arr[0][0] end end class HTMLTag < HTMLToken attr_reader :end_tag, :tag_name def initialize(text) super(text) if ?< != text[0] or ?> != text[-1] raise "Text passed to HTMLComment.initialize is not a comment" end @attr_hash = Hash.new @raw = text tag_name = text.scan(/[\w:]+/)[0] if tag_name.nil? raise "Error, tag is nil: #{tag_name}" end if ?/ == text[1] # It's an end tag @end_tag = true @tag_name = '/' + tag_name.downcase else @end_tag = false @tag_name = tag_name.downcase end @hashed = false end def attr_hash # Lazy initialize == don't build the hash until it's needed if !@hashed if !@end_tag # Get the attributes attr_arr = @raw.scan(/<[\w:]+\s+(.*)>/m)[0] if attr_arr.kind_of?(Array) # Attributes found, parse them attrs = attr_arr[0] attr_arr = attrs.scan(/\s*([\w:]+)(?:\s*=\s*("[^"]*"|'[^']*'|([^"'][^\s>]+)))?/m) # clean up the array by: # * setting all nil elements to true # * removing enclosing quotes attr_arr.each { |item| val = if item[1].nil? true elsif '"'[0] == item[1][0] or '\''[0] == item[1][0] item[1][1 .. -2] else item[1] end @attr_hash[item[0].downcase] = val } end end @hashed = true end #p self @attr_hash end def text if !end_tag case tag_name when 'img' if !attr_hash['alt'].nil? return attr_hash['alt'] end when 'applet' if !attr_hash['alt'].nil? return attr_hash['alt'] end end end return '' end end if $0 == __FILE__ page = " This is the title

This is the header

This is the paragraph, it contains links, images
are
really cool. Ok, here is some more text and another link.

" toke = HTMLTokenizer.new(page) p toke.getTag("IMG", "A") p toke.getTrimmedText #p toke.getTrimmedText("/P") #while (t = toke.getNextToken) # p t # print t.trimmed_text + " " #end #p toke.page end