module MList
  module Util
    # Renders an HTML document as plain text.
    #
    # Headings, paragraphs, lists and definition lists become text blocks
    # separated by blank lines, <strong>/<em> are marked with * and _, and
    # every anchor is replaced by a numbered marker ("[1]") whose href is
    # listed in a footer appended after a "--" line.
    class HtmlTextExtraction
      # We need a way to maintain non-breaking spaces. Per the original note,
      # Hpricot replaces the entity while parsing; teaching it to emit a plain
      # space instead would lose the information in the Text node that the
      # space must be kept (it is what a reader of the rendered HTML would
      # see). So the entity is swapped for this sentinel before parsing and
      # turned back into a space once all whitespace clean-up is done.
      NBSP = '!!!NBSP!!!'.freeze

      # @param html [String] the HTML source to convert
      def initialize(html)
        @doc = Hpricot(html.gsub('&nbsp;', NBSP))
      end

      # Walks the parsed document and builds the plain-text rendering.
      #
      # @return [String] the extracted text, followed by the anchor footer
      #   when the document contained links
      def execute
        @text, @anchors = '', []
        @doc.each_child do |node|
          # Only element nodes are visited at the top level.
          extract_text_from_node(node) if node.is_a?(Hpricot::Elem::Trav)
        end
        @text.strip!
        unless @anchors.empty?
          refs = @anchors.each_with_index.map { |href, i| "[#{i + 1}] #{href}" }
          @text << "\n\n--\n#{refs.join("\n")}"
        end
        # Sentinels become ordinary spaces only now, so the stripping above
        # could not have removed them.
        @text.gsub(NBSP, ' ')
      end

      # Appends the text rendering of a single element to the buffer,
      # dispatching on the element's tag name.
      #
      # @param node [Hpricot::Elem] the element to render
      def extract_text_from_node(node)
        case node.name
        when 'head'
          # Nothing inside <head> is rendered.
        when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
          @text << node.inner_text
          @text << "\n\n"
        when 'br'
          @text << "\n"
        when 'ol'
          node.children_of_type('li').each_with_index do |li, i|
            @text << " #{i + 1}. #{li.inner_text}"
            @text << "\n\n"
          end
        when 'ul'
          node.children_of_type('li').each do |li|
            @text << " * #{li.inner_text.strip}"
            @text << "\n\n"
          end
        when 'strong'
          @text << "*#{node.inner_text}*"
        when 'em'
          @text << "_#{node.inner_text}_"
        when 'dl'
          node.traverse_element('dt', 'dd') do |dt_dd|
            extract_text_from_node(dt_dd)
          end
        when 'a'
          # Record the target first so the marker number matches its
          # position in the footer.
          @anchors << node['href']
          extract_text_from_text_node(node)
          @text << "[#{@anchors.size}]"
        when 'p', 'dt', 'dd'
          extract_text_from_children(node)
          @text.rstrip!
          @text << "\n\n"
        else
          extract_text_from_children(node)
        end
      end

      # Renders each child of +elem+: text nodes are appended as text and
      # element nodes are dispatched by tag; other node kinds are ignored.
      #
      # @param elem [Hpricot::Elem] the element whose children are rendered
      def extract_text_from_children(elem)
        elem.each_child do |node|
          case node
          when Hpricot::Text::Trav
            extract_text_from_text_node(node)
          when Hpricot::Elem::Trav
            extract_text_from_node(node)
          end
        end
      end

      # Appends the node's inner text to the buffer. Leading whitespace is
      # dropped when the buffer currently ends with a newline, runs of two or
      # more whitespace characters collapse to one space, and the first
      # remaining newline is removed.
      #
      # @param node [#inner_text] a text node, or an element whose text is
      #   wanted verbatim (as done for anchors)
      def extract_text_from_text_node(node)
        text = @text.end_with?("\n") ? node.inner_text.lstrip : node.inner_text
        @text << text.gsub(/\s{2,}/, ' ').sub(/\n/, '')
      end
    end

    # Helper methods for building and cleaning up email content.
    module EmailHelpers
      # Runs the sanitizer registered for header +name+ over +values+.
      def sanitize_header(charset, name, *values)
        header_sanitizer(name).call(charset, *values)
      end

      # Looks up the sanitizer for header +name+ in
      # Util.default_header_sanitizers.
      def header_sanitizer(name)
        Util.default_header_sanitizers[name]
      end

      # @param html [String]
      # @return [String] plain-text rendering produced by HtmlTextExtraction
      def html_to_text(html)
        HtmlTextExtraction.new(html).execute
      end

      # Converts CRLF and bare CR line endings to LF. +nil+ becomes "".
      #
      # @return [String]
      def normalize_new_lines(text)
        text.to_s.gsub(/\r\n?/, "\n")
      end

      # Returns "Display Name <address>" when the subscriber responds to
      # +display_name+, otherwise the bare email address.
      def subscriber_name_and_address(subscriber)
        address = subscriber.email_address
        return address unless subscriber.respond_to?(:display_name)

        "#{subscriber.display_name} #{bracket(address)}"
      end

      BRACKETS_RE = /\A<(.*?)>\Z/

      # Wraps +string+ in angle brackets unless it is blank or already
      # bracketed.
      def bracket(string)
        (string.blank? || string =~ BRACKETS_RE) ? string : "<#{string}>"
      end

      # Strips one surrounding pair of angle brackets, if present.
      def remove_brackets(string)
        string =~ BRACKETS_RE ? Regexp.last_match(1) : string
      end

      REGARD_RE = /(^|[^\w])re: /i

      # Removes every "re: " marker (case-insensitive) from +string+ and
      # strips the result.
      #
      # A loop is used instead of a single gsub because consecutive markers
      # share their separating space ("Re: Re: x"); a single pass would
      # consume that space with the first match and miss the second marker.
      def remove_regard(string)
        string = string.sub(REGARD_RE, ' ') while string =~ REGARD_RE
        string.strip
      end

      # Converts plain text into HTML that keeps its visual layout:
      # special characters are escaped, leading whitespace and doubled
      # whitespace are preserved with &nbsp;, quoted lines are wrapped in a
      # span, and lines are joined with <br />.
      #
      # @param text [String, nil]
      # @return [String] an HTML fragment
      def text_to_html(text)
        lines = normalize_new_lines(text).split("\n").map do |line|
          line = escape_once(line)
          # Browsers collapse leading whitespace, so spell out each character.
          line = line.sub(/\A\s+/) { |indent| '&nbsp;' * indent.length }
          # The line is already escaped here, so a quote marker appears as
          # "&gt;" ("&gt; text", "| text" or "Name&gt; text").
          # NOTE(review): the wrapper markup is assumed to be a span with
          # class "quote" - confirm against the templates/stylesheet.
          line = %{<span class="quote">#{line}</span>} if line =~ /^(&gt;|[|]|[A-Za-z]+&gt;)/
          line.gsub(/\s\s/, ' &nbsp;')
        end
        lines.join("<br />\n")
      end

      # Prefixes every line of +text+ with "> ".
      #
      # @return [String]
      def text_to_quoted(text)
        normalize_new_lines(text).split("\n").map { |line| "> #{line}" }.join("\n")
      end

      HTML_ESCAPE = { '&' => '&amp;', '>' => '&gt;', '<' => '&lt;', '"' => '&quot;' }.freeze

      # Escapes &, <, > and " as HTML entities, leaving ampersands that
      # already begin a named or numeric entity untouched so that text is
      # never double-escaped.
      #
      # @return [String]
      def escape_once(text)
        text.gsub(/[\"><]|&(?!([a-zA-Z]+|(#\d+));)/) { |special| HTML_ESCAPE[special] }
      end
    end
  end
end