# Geoff Davis geoff at geoffdavis.net
# Wed May 2 20:08:44 EDT 2007
# http://rubyforge.org/pipermail/raleigh-rb-members/2007-May/000789.html
# modified by mtracy at matasano.com for WWMD
 
module WWMD
   InlineTags = ['a','abbr','acronym','address','b','bdo','big','cite','code','del','dfn','em','font','i','ins','kbd','label','noframes','noscript','q','s','samp','small','span','strike','strong','sub','sup','td','th','tt','u','html','body','table']
   BlockTags =  ['blockquote','br','center','dd','div','fieldset','form','h1','h2','h3', 'h4','h5','h6','hr','p','pre','tr','var',]
   ListTags =   ['dir','dl','menu','ol','ul']
   ItemTags =   ['li','dt']
#   AsciiEquivalents =  {"amp"=>"&","bull"=>"*","copy"=>"(c)","laquo"=>"<<","raquo"=>">>","ge"=> ">=","le"=>"<=","mdash"=>"-","ndash"=>"-","plusmn"=>"+/-","times"=>"x"}
 
#   NamedCharRegex = Regexp.new("(&("+Hpricot::NamedCharacters.keys.join("|")+");)")
 
  class Page
    def element_to_text(n)
      tag = n.etag || n.stag
      name = tag.name.downcase
      s = ""
      is_block  = BlockTags.include?(name)
      is_list   = ListTags.include?(name)
      is_item   = ItemTags.include?(name)
      is_inline = InlineTags.include?(name)
      if is_block or is_list or is_item or is_inline
        n.each_child do |c|
          s += node_to_text(c)
        end
        if is_block or is_list
          s += "\n"
        elsif is_item
          s = "* " + s + "\n"
        end
      end
      s
    end
 
    def node_to_text(n)
      return "" if n.comment?
      return element_to_text(n) if n.elem?
      return n.inner_text if n.text?
  
      s = ""
      begin
        n.each_child do |c|
          s += node_to_text(c)
        end
      rescue => e
#        puts "WARNING: #{e.inspect}"
      end
      return s
    end
 
 #   def lookup_named_char(s)
 #     c = Hpricot::NamedCharacters[s[1...-1]]
 #     c.chr if c
 #   end
 
    def html2text
      doc = self.scrape.hdoc
      text = node_to_text(doc)
#      text.gsub!(NamedCharRegex){|s| "#{lookup_named_char(s)}"}
      # clean up white space
      text.gsub!("\r"," ")
      text.squeeze!(" ")
      text.strip!
      ret = ''
      text.split(/\n/).each do |l|
        l.strip!
        next if l == ''
        next if l =~ /^\?+$/
        ret += "#{l}\n"
      end
      return ret
    end
  end
end