# encoding: utf-8
#
# File::      HTMLProcessor.rb
# Author::    wkm
# Copyright:: 2009
# License::   GPL
#
# Defines an HTMLProcessor class that is used to compress and cleanup HTML
# files for deployment with sitefuel. Uses hpricot to process the HTML.
#

module SiteFuel
  module Processor

    require 'hpricot'
    require 'sitefuel/processors/AbstractStringBasedProcessor'
    require 'sitefuel/processors/CSSProcessor'
    require 'sitefuel/processors/JavaScriptProcessor'

    # String-based processor for HTML documents. The document is parsed with
    # Hpricot before the filters run (#setup_filters), each +filter_*+ method
    # edits the parsed structure in place, and #finish_filters serializes the
    # structure back into the document string.
    class HTMLProcessor < AbstractStringBasedProcessor

      #
      # TYPOGRAPHIC REPLACEMENT CHARACTERS
      # (stored as literal UTF-8 characters)
      #

      # quotes
      SINGLE_QUOTE_OPEN  = '‘'.freeze
      SINGLE_QUOTE_CLOSE = '’'.freeze
      DOUBLE_QUOTE_OPEN  = '“'.freeze
      DOUBLE_QUOTE_CLOSE = '”'.freeze

      # dashes
      EN_DASH = '–'.freeze
      EM_DASH = '—'.freeze

      # signs
      ELLIPSIS   = '…'.freeze
      COPYRIGHT  = '©'.freeze
      TRADEMARK  = '™'.freeze
      REGISTERED = '®'.freeze

      # arrows
      ARROW_LEFTWARD         = '←'.freeze
      ARROW_RIGHTWARD        = '→'.freeze
      ARROW_LEFTRIGHT        = '↔'.freeze
      ARROW_DOUBLE_LEFTWARD  = '⇐'.freeze
      ARROW_DOUBLE_RIGHTWARD = '⇒'.freeze
      ARROW_DOUBLE_LEFTRIGHT = '⇔'.freeze

      # math operators
      MULTIPLICATION_SIGN = '×'.freeze

      # list of tags which have proper text items inside them
      TEXTUAL_TAGS = %w[h1 h2 h3 h4 h5 h6 p b i ul a li td th].freeze

      # filter for use with XPath searches
      TEXTUAL_TAGS_FILTER = TEXTUAL_TAGS.join('|').freeze

      # tags whose text must keep its whitespace exactly as written
      PREFORMATTED_TAGS = ['pre'].freeze

      # ASCII arrow patterns paired with their replacement character. The
      # rules are applied in this order by #filter_beautify_arrows.
      ARROW_SUBSTITUTIONS = [
        [/(\s|\b)-->(\s|\b)/, ARROW_RIGHTWARD],
        [/(\s|\b)<--(\s|\b)/, ARROW_LEFTWARD],
        [/(\s|\b)<->(\s|\b)/, ARROW_LEFTRIGHT],
        [/(\s|\b)==>(\s|\b)/, ARROW_DOUBLE_RIGHTWARD],
        [/(\s|\b)<==(\s|\b)/, ARROW_DOUBLE_LEFTWARD],
        [/(\s|\b)<=>(\s|\b)/, ARROW_DOUBLE_LEFTRIGHT]
      ].freeze

      # gives the file patterns which this processor will match
      def self.file_patterns
        # plain html
        ['.html', '.htm']
      end

      # name of the filterset used when none is requested
      def self.default_filterset
        :minify
      end

      # filters that shrink the document; each symbol names the matching
      # +filter_<name>+ method below
      def self.filterset_minify
        [:whitespace, :minify_javascript, :minify_styles]
      end

      # filters that apply typographic cleanups; each symbol names the
      # matching +filter_<name>+ method below
      def self.filterset_beautify
        [:beautify_quotes, :beautify_dashes, :beautify_arrows, :beautify_symbols]
      end

      #
      # FILTERS
      #

      # before any filters are run parse the document with hpricot
      def setup_filters
        @htmlstruc = Hpricot.parse(document)
      end

      # after all the filters are run dump the HTML as a string and do a
      # tiny bit of post processing
      def finish_filters
        # do a last minute, ugly +br+ cleanup
        #
        # NOTE(review): both literals in this call had degraded to a bare
        # newline (a no-op replacement). They are reconstructed here as the
        # self-closing form being shortened to the plain tag -- confirm
        # against the project history.
        @document = @htmlstruc.to_s.gsub('<br />', '<br>')
      end

      # Visits every text node below each element matching +patterns+ and
      # calls the block with the element's pathname and the text node.
      def traverse(patterns = TEXTUAL_TAGS_FILTER, &block)
        (@htmlstruc/patterns).each do |tag|
          tag.traverse_text do |txt|
            block.call(tag.pathname, txt)
          end
        end
      end

      # strips excess whitespace in most HTML tags. Notably, +pre+ tags are
      # left alone.
      def filter_whitespace
        @htmlstruc.traverse_text do |txt|
          # whitespace inside preformatted elements is significant
          next if preformatted?(txt)

          txt.content =
            if txt.content =~ /\A\s+\z/
              # whitespace-only nodes are dropped entirely
              ''
            else
              # any other run of whitespace collapses to a single space
              txt.content.gsub(/\s+/m, ' ')
            end
        end
      end

      # minifies embedded JavaScript code using the JavaScriptProcessor
      def filter_minify_javascript
        # TODO check the language attribute to make sure it's javascript
        traverse('script') do |_tag_name, txt|
          # NOTE(review): the suffix appended to resource_name is an empty
          # string, so this only passes a copy of the name; a descriptive
          # suffix may have been intended -- confirm.
          txt.content = JavaScriptProcessor.process_string(
            txt.content,
            {:resource_name => resource_name + ''}
          )
        end
      end

      # minifies embedded CSS styles using the CSSProcessor
      def filter_minify_styles
        traverse('style') do |_tag_name, txt|
          # NOTE(review): the suffix appended to resource_name is an empty
          # string, so this only passes a copy of the name; a descriptive
          # suffix may have been intended -- confirm.
          txt.content = CSSProcessor.process_string(
            txt.content,
            :resource_name => resource_name + ''
          )
        end
      end

      # cleans up double and single quotes in textual objects
      #
      #   "hello world"  =>  “hello world”
      #
      def filter_beautify_quotes
        traverse do |_tag_name, txt|
          text = txt.content

          # apostrophes: "'s" after any non-space character, and a trailing
          # "s'" followed by whitespace
          text = text.gsub(/(\S)'(s)/i, "\\1#{SINGLE_QUOTE_CLOSE}\\2")
          text = text.gsub(/(\Ss)'(\s)/i, "\\1#{SINGLE_QUOTE_CLOSE}\\2")

          # double quotes
          text = text.gsub(/"(\S.*?\S)"/,
                           "#{DOUBLE_QUOTE_OPEN}\\1#{DOUBLE_QUOTE_CLOSE}")

          # single quotes
          text = text.gsub(/'(\S.*?\S)'/,
                           "#{SINGLE_QUOTE_OPEN}\\1#{SINGLE_QUOTE_CLOSE}")

          txt.content = text
        end
      end

      # cleans up the various dash forms:
      #
      #   12--13  =>  12–13
      #   the car---it was red---was destroyed  =>  ...—it was red—...
      #
      def filter_beautify_dashes
        traverse do |_tag_name, txt|
          text = txt.content

          # between two numbers we have an en dash. The trailing digit is
          # only looked at, not consumed, so chained ranges such as
          # 1--2--3 are converted at every position.
          text = text.gsub(/(\d)--(?=\d)/, "\\1#{EN_DASH}")

          # we can also have multiple en-dashes: an even run of four or
          # more hyphens becomes one en dash per pair
          text = text.gsub(/\b(--(--)+)(\b|\z|\s)/) do
            hyphens  = Regexp.last_match(1)
            trailing = Regexp.last_match(3)
            EN_DASH * (hyphens.length / 2) + trailing
          end

          # three dashes in general are an em dash
          text = text.gsub(/(\s|\b)---(\s|\b)/, "\\1#{EM_DASH}\\2")

          txt.content = text
        end
      end

      # convert basic arrow forms to unicode characters
      def filter_beautify_arrows
        traverse do |_tag_name, txt|
          txt.content = ARROW_SUBSTITUTIONS.inject(txt.content) do |text, (pattern, arrow)|
            text.gsub(pattern, "\\1#{arrow}\\2")
          end
        end
      end

      # converts 'x' signs between numbers into the unicode symbol
      #
      # Not implemented yet: the method body is empty and it is not listed
      # in any filterset.
      def filter_beautify_math
      end

      # convert a few shorthands like (c), (tm) to their unicode symbols
      def filter_beautify_symbols
        traverse do |_tag_name, txt|
          text = txt.content

          # the parentheses are escaped so they match literally
          text = text.gsub(/\(tm\)/i, TRADEMARK)
          text = text.gsub(/\(c\)/i, COPYRIGHT)
          text = text.gsub(/\(r\)/i, REGISTERED)

          # three dots become an ellipsis; an optional fourth dot (sentence
          # end) is kept after it
          text = text.gsub(/(\b| )\.\.\.(\.)?/, "\\1#{ELLIPSIS}\\2")

          txt.content = text
        end
      end

      private

      # Tells whether +node+ sits inside one of the PREFORMATTED_TAGS by
      # walking up its chain of parents. Objects that expose no +parent+ or
      # +name+ simply end or skip a step of the walk.
      def preformatted?(node)
        ancestor = node.respond_to?(:parent) ? node.parent : nil
        while ancestor
          if ancestor.respond_to?(:name) &&
             PREFORMATTED_TAGS.include?(ancestor.name.to_s.downcase)
            return true
          end
          ancestor = ancestor.respond_to?(:parent) ? ancestor.parent : nil
        end
        false
      end

    end
  end
end