require File.dirname(__FILE__) + '/htmlentities' require 'hpricot' require 'highline/import' require 'active_support/core_ext/string/unicode' # This class formats stuff for human consumption class Display include CharacterCleaner attr_reader :width # In the future, may allow configuration options here, so that you can # customize the Display object. def initialize(options={}) # could opt for :no_links @options = options @width = options[:width] || 80 end def list_feeds(feeds) if @options[:curses] CursesController.new(self).show_feeds(feeds) else feeds.each {|f| display_feed(f)} end end def display_entries(entries) if @options[:curses] CursesController.new(self).show_entries(entries) else entries.each do |e| puts '-' * @width puts display_entry(e) end end end def display_entry(entry, show_title_and_feed = true) out = [] if show_title_and_feed out << "#{entry.feed.title} | #{entry.last_updated.strftime('%A, %B %d %Y')}" out << wrap_text(html_to_text(entry.title.strip)) end # description is the summary, so mark it as such if entry.description && entry.content && (entry.description.strip.length < entry.content.strip.length) out << "Entry Summary:\n\n" + html_to_text(entry.description).strip out << divider.strip out << "Entry Content:" end # If there is no content, just print the description if entry.content.nil? || entry.content.strip == '' out << html_to_text(entry.description || '').strip else out << html_to_text(entry.content || '').strip end unless @options[:simple] out << entry.url end unless entry.categories.empty? out << "Categories: #{entry.categories.join(", ")}" end out.join("\n\n") end def display_title_and_feed(entry) out = [] out << "#{entry.feed.title} | #{entry.last_updated.strftime('%A, %B %d %Y')}" out << wrap_text(html_to_text(entry.title.strip)) out.join("\n\n") end def display_title(entry) wrap_text(html_to_text(entry.title.strip)) end def display_feed(entry) "#{entry.feed.title} | #{entry.date_published.strftime('%A, %B %d %Y')}" end def display_raw_entry_content(entry) out = [] if entry.description out << divider + "Entry Summary\n\n" + entry.description end if entry.content out << "Entry Content\n\n" + entry.content end wrap_text(out.join(divider)).strip end def html_to_text(html) LOGGER.debug("html_to_text:\n #{html}") html.strip! # convert utf-8 to ascii html = process_entities_and_utf(html) # if there are no tags, and this is a body, wrap what looks like paragraph # in paragarph tags, so it can be processed with the html paragraph rule # below. if html =~ /\n/ && html !~ /<[^>]+>/ && html !~ /<\/[^>]+>/ html = html.split("\n\n").collect {|x| "
#{x}
"}.join("\n") end html, *links = links_to_footnotes(html) html = tags_to_text(html) html = normalize_blank_lines(html) html = wrap_text(html) html = [html, links.join("\n")].join("\n\n") #puts out #out = normalize_blank_lines(out) html.strip end # Make sure there is no more than one blank line anywhere def normalize_blank_lines(text) # get rid of ms line feeds text.gsub(/\r\n/, "\n"). # compress 3 or more blank lines to one gsub(/^\s*$/, "\n"). split(/\n\n\n*/).join( "\n\n") # hflush everything left to begin with #gsub(/^\s+(\w)/, '\1') end def tags_to_text(html) doc = Hpricot(html) doc.search('//comment()').remove doc.search('div') do |p| p.swap( "\n\n" + p.inner_text.gsub("\n", ' ').squeeze(' ').strip + "\n\n" ) end doc.search('p') do |p| p.swap( "\n\n" + p.inner_text.gsub("\n", ' ').squeeze(' ').strip + "\n\n" ) end doc.search('//blockquote') do |x| # compress extra spaces text = x.inner_text.squeeze(' ').strip # collapse the spacing in the text text.gsub!(/\s{2,}/, ' ') text = wrap_text(text, @width - 4).gsub(/^/, ' ') # indent 4 spaces x.swap("\n\n" + text + "\n\n") end doc.search('h1,h2,h3,h4') do |p| p.swap( "\n\n= #{p.inner_text}\n\n" ) end doc.search('//img') do |img| img.swap( "(img)" ) end doc.search('object').remove doc.search('table').remove doc.search('script').remove doc.search('//br') do |p| p.swap( "\n" ) end doc.search('i, b') do |p| p.swap( "*#{p.inner_text}*" ) end # anchor tags are processed after real links doc.search('a') do |p| p.swap( "#{p.inner_text}" ) end doc.search('dt') do |x| x.swap( "#{x.inner_html}:\n" ) end # This could be improved to insure an indentation even if there are nested # tag elements doc.search('dd') do |x| x.swap( "#{x.inner_text}\n" ) end doc.search('//span') do |s| s.swap( s.inner_text ) end doc.search('hr') do |s| s.swap( '-' * @width) end # Do this before erasing the enclosing